diff --git a/.gitignore b/.gitignore index c3ddc7ea990..194948758a3 100644 --- a/.gitignore +++ b/.gitignore @@ -121,3 +121,11 @@ CLAUDE.md # Conformance test outputs (timestamped folders from --test mode) **/conformance/20*-*-*_*-*-*/ + +# Generated by the authored_workflow_demo "Export plan" beat (sample output) +security_audit_plan.json + +# ADK Web demo session stores (runtime) +demo_sessions*.db +ca_demo_sessions*.db +ca_plan_store/ diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/README.md b/contributing/samples/workflows/authored_workflow_ca_demo/README.md new file mode 100644 index 00000000000..b7c2c341a3a --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/README.md @@ -0,0 +1,132 @@ +# ADK Web demo — model-authored workflows for BigQuery Conversational Analytics (RFC #93) + +One agent, **seven prompts, seven workflow shapes**. Styled after [BigQuery +Conversational Analytics](https://docs.cloud.google.com/bigquery/docs/conversational-analytics): +a user asks data questions in natural language, and the planner **authors a +different typed `WorkflowSpec` per scenario** over Conversational-Analytics +capabilities — `nl2sql`, `dry_run`, `run_query`, `profile_table`, `skeptic`, +chart judging — against a mock `thelook_ecommerce` dataset (the dataset the +CA docs demo against). **Query execution is REAL BigQuery** when +credentials allow: `dry_run` hits the actual BigQuery dry-run API (real +errors, real bytes-scanned) and `run_query` executes against +`bigquery-public-data.thelook_ecommerce`, billed to your +`GOOGLE_CLOUD_PROJECT` with safety rails (`maximum_bytes_billed` = 2 GB per +query, 500-row result cap). Multi-dimensional questions ("each region's +trend per year") return real grouped results and chart as multi-series +lines. 
Without credentials (or with `CA_DEMO_USE_BIGQUERY=0`), execution +falls back to a deterministic micro-warehouse (synthetic facts + +SQL-intent aggregation) so CI and credential-less machines keep working — +each dry-run/result beat carries an `engine` field (`bigquery` or `mock`) +so the demo never misrepresents its data source. The language steps +(NL2SQL, summaries, classification, skeptics) are live Gemini calls. + +Every scenario runs the full #93 machinery: **author → validate → +independence lints → freeze (per-scenario key) → execute on the real engine +(#92 supervisor) → cost line**, and every shape is pinned in CI with the +language capabilities stubbed. + +## 0. Configure a model (no hardcoded project) + +```bash +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=<your-project-id> +export GOOGLE_CLOUD_LOCATION=global +export SPIKE_GEMINI_MODEL=gemini-3.5-flash +``` + +## 1. Run it + +```bash +adk web contributing/samples/workflows/authored_workflow_ca_demo --port 8001 +``` + +**Talk to it first** — the agent has a conversational gate (the RFC's +"no-plan escape hatch"): untriggered messages are intent-classified, and +meta/chit-chat turns get a direct answer instead of a workflow. Try: + +```text +What kinds of workflow can you issue? +``` + +→ a plain-language catalogue of the seven shapes with example prompts — `0 planner calls, 0 queries`. Data questions proceed to the machinery below. 
+ +Open the UI, pick `bq_ca_planner`, and send the prompts below — **one +scenario per prompt**, each authoring a different coordination shape: + +| # | Send this prompt | Shape authored | CA story | +| --- | --------------------------------------------------------------------------------------- | ---------------------------------------------------------------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| 1 | `What was revenue by region last quarter?` | `loop_until(draft → REAL dry-run → repair) → run_query → render_chart + summarize` | the standard CA flow — **your actual question is the task input**, and a real BigQuery dry-run error (e.g. `TIMESTAMP_SUB ... YEAR`) feeds the repair round | +| 2 | `Profile data quality across the dataset tables.` | fan-out → synthesize | per-table profiling in parallel, one report | +| 3 | `Build a dashboard for these three questions.` | pipeline(`nl2sql → dry_run`) per item | each panel translated + validated barrier-free | +| 4 | `Route my question: what does order status 'Complete' mean?` | classify & route (branch) | metadata questions skip SQL planning — answered by a data-grounded agent that queries the REAL data (DISTINCT values, counts) | +| 5 | `Answer with SQL self-repair — the dry run is unreliable.` | loop_until + **loop-carried `init`** | a REALLY broken query (`thelook_ecommerce.order`) is checked by the REAL dry-run, repaired from the actual BigQuery error, then executed | +| 6 | `Audit this insight: ` (or just `audit that insight` after a question) | adversarial verification | **audits YOUR insights with DATA-GROUNDED skeptics** — each runs real BigQuery checks via its `query_thelook` tool and cites the numbers (the $1M-AOV claim is refuted with the actual ~$86 AOV); 
insights from your message, the session's last insight, or the canned fallback | +| 7 | `Pick the best chart for revenue by region.` | tournament | pairwise chart judging to a single winner | + +What to point at as each one streams: + +- **🗂️ scenario banner** — the expected shape, named before the model authors it. +- **📋 authored plan** — a *different* typed `WorkflowSpec` per prompt; same closed vocabulary every time. +- **✅ + 🧪 validation & independence lints** — every scenario lints clean; the provenance facts are statically provable from the bindings. +- **🔒 freeze (per-scenario key) + 📦 cross-session export** — every authored plan exports its full `FrozenWorkflowRecord` to `ca_plan_store/.json`. **Re-send any prompt**: same hash, `0 planner calls (frozen replay)`. **Start a whole new session** and ask again: the plan is **imported from the store** through the RFC's defensive path — spec hash recomputed, re-validated against the current registry, manual-version + declared-contract drift fail loudly (input kind + declared output schema; the typed object-output capabilities declare output models so the hash has teeth — primitive helpers like `sql_ok`/`judge_chart` return bare bool/str/list values and rely on manual versions) (with the rejection shown, then a fresh authoring), and your new question is validated against the captured `task_input_schema` (cross-session **template reuse**). Plans now outlive sessions. +- **template reuse (scenario 1)** — after the first ask, send a *different* question (`What was revenue by region last year?`): the frozen plan is reused unchanged, your new question flows through it as new task input, and the mock rows change with the window (quarter vs year canned sets). Same plan, new data — the RFC's replay-vs-template distinction, live. 
+- **📈 chart** — scenarios 1 and 7 emit the Conversational-Analytics-style chart artifact: a **rendered chart image inline in the chat** (matplotlib, optional — falls back to a Unicode preview) plus the **Vega-Lite spec** (what the real CA API returns). Time-series rows infer a line mark; in the tournament, the bracket picks the mark and `render_chart` draws the data with it. +- **honest failure handling** — a query that still fails after repair returns empty rows + the real error (`engine: bigquery`); the mock warehouse is used ONLY when credentials are absent, never to paper over a failing query. +- **📄 result + 📊 cost** — real execution on the #92 supervisor; the repair scenario shows exactly one repair iteration (`Table not found … did you mean orders?` → fixed), the audit scenario rejects the implausible insight, the tournament returns `["bar"]`. + +Talking point for scenario 5 (the differentiated one): *the repair loop needs +**loop-carried state** — the drafting step reads the loop's own id to get the +prior round's failed dry-run output. That's `LoopUntil.init`, the vocabulary +gap the pattern-coverage sweep surfaced. And the whole loop is frozen and +replayable — a turn-by-turn agent retry never is.* + +## 2. Correctness proof (no LLM, no BigQuery) + +```bash +pytest contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py -q # 38 collected (one live-gated; one gated on the patched ADK wrapper) +``` + +All seven expected shapes are built by hand, validated + lint-checked against +the demo registry, and **executed end-to-end** with the language capabilities +stubbed: the loop repairs exactly once, the branch routes the metadata +question away from SQL, the audit rejects the implausible insight, the +tournament converges to `bar` and renders it as a Vega-Lite chart artifact. The fan-out and tournament scenarios execute +against the **live** registry (their capabilities are deterministic mocks). 
+ +## SQL freezing + human-feedback revision + +Plan freezing pins the *process*; **SQL freezing pins the numbers**. After a +question's SQL passes the real dry-run, it's frozen to +`ca_plan_store/sql/.json`. Re-ask the exact question (any +session): the drafting LLM is **skipped**, the frozen SQL re-validates +(doubling as warehouse-drift detection) and replays — live-verified +identical results run-to-run. Then govern it with feedback: + +```text +revise: exclude orders with status Cancelled or Returned +``` + +→ the SQL is revised to follow the feedback, must pass the REAL dry-run +before it replaces the frozen artifact, and the feedback itself is recorded +in the artifact's `revisions` history — who changed the query and why, +auditable. A failed revision leaves the frozen SQL untouched. + +## Notes + +- Honesty: like the security-audit demo, scenario recipes are + instruction-guided so each prompt reliably authors its intended shape; the + free-decomposition evidence is the spike's demand gate and the main demo's + free-authoring beat. The *variety* — seven shapes from one closed + vocabulary — is the claim here. +- Nothing in the live path is simulated anymore: the repair scenario + checks a really-broken query against the real dry-run; transient-failure + simulation now lives only in the CI test stubs. +- Frozen plans are per-scenario (`authored_workflow:ca:`) in + session state, AND exported per-scenario to `ca_plan_store/` for + cross-session reuse (delete a file to force fresh authoring; the store is + the demo's stand-in for the ArtifactService in the RFC's revised Q1). +- Scenario 1 takes your live message as the question; the other six prompts + are mode selectors with canned task inputs (their results don't change + with your wording). Query answers come from real BigQuery when + credentials allow (check the `engine` field in the dry-run/result beats); + otherwise the deterministic micro-warehouse. 
diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py new file mode 100644 index 00000000000..1a38cf933e9 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import agent # noqa: F401 diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py new file mode 100644 index 00000000000..e3c39039041 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/bq_ca_planner/agent.py @@ -0,0 +1,1885 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""ADK Web demo agent for RFC #93 — BigQuery Conversational Analytics planner. 
+ +One agent, SEVEN scenario prompts, each making the planner author a DIFFERENT +workflow shape over Conversational-Analytics-flavored capabilities (nl2sql, +dry_run, run_query, profiling, insight verification) against the +``thelook_ecommerce`` dataset: + + sequence "What was revenue by region last quarter?" + fan-out "Profile data quality across the dataset tables." + pipeline "Build a dashboard for these three questions." + branch "Route my question: what does order status 'Complete' mean?" + loop_until "Answer with SQL self-repair — the dry run is unreliable." + adversarial "Audit these insights — verify each one independently." + tournament "Pick the best chart for revenue by region." + +Each scenario runs the same machinery as the security-audit demo: author +(live planner) -> validate -> independence lints -> freeze (per-scenario +state key; re-send replays without re-invoking the model) -> execute on the +real engine via the #92 supervisor -> cost line. Query execution and dry-run +hit real BigQuery when credentials allow and fall back to a deterministic +micro-warehouse otherwise (or with CA_DEMO_USE_BIGQUERY=0); language steps +(nl2sql, summaries, classification, skeptics) are live Gemini calls. Run: + + adk web contributing/samples/workflows/authored_workflow_ca_demo + +Configure a model first (no hardcoded project): + export GOOGLE_GENAI_USE_VERTEXAI=1 GOOGLE_CLOUD_PROJECT=<your-project-id> + export GOOGLE_CLOUD_LOCATION=global SPIKE_GEMINI_MODEL=gemini-3.5-flash +""" + +from __future__ import annotations + +import datetime +import json +import math +import os +import re +import sys +import time +from typing import Literal + +from google.adk import Agent +from google.adk import Context +from google.adk import Event +from google.adk import Workflow +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel + +# Reuse the committed #93 authoring stack (sibling sample dir). 
# Prepend <this file's dir>/../../authored_workflow_spike to sys.path so the
# `authoring` imports below resolve. They must come after this call, which is
# why each one carries an E402 (import-not-at-top) suppression.
sys.path.insert(
    0,
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "..",
        "authored_workflow_spike",
    ),
)
from authoring import Binding  # noqa: E402
from authoring import Capability  # noqa: E402
from authoring import CapabilityRegistry  # noqa: E402
from authoring import export_plan  # noqa: E402
from authoring import FrozenWorkflowRecord  # noqa: E402
from authoring import import_plan  # noqa: E402
from authoring import independence_facts  # noqa: E402
from authoring import PlanImportError  # noqa: E402
from authoring import sha256_hex  # noqa: E402
from authoring import SpecInterpreter  # noqa: E402
from authoring import StepRef  # noqa: E402
from authoring import WorkflowSpec  # noqa: E402
from authoring import WorkflowSpecValidator  # noqa: E402

# Model id, overridable via the SPIKE_GEMINI_MODEL environment variable.
MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash")
# Generation config pinned to temperature 0 (presumably so the live language
# steps are as repeatable as possible — confirm against the call sites).
DET = types.GenerateContentConfig(temperature=0)

# ------------------------------------------------- mock thelook_ecommerce
# A miniature of bigquery-public-data.thelook_ecommerce — the dataset the
# Conversational Analytics docs demo against. run_query/dry_run/profiling
# use the real dataset when BigQuery credentials allow; the definitions
# below back the deterministic mock fallback, so the demo still runs
# without a BigQuery project.
+TABLES = { + "orders": "order_id, user_id, status, created_at, num_of_item", + "order_items": "id, order_id, product_id, sale_price, status, created_at", + "products": "id, name, category, brand, retail_price, department", + "users": "id, email, age, country, traffic_source, created_at", + "events": ( + "id, user_id, session_id, created_at, city, browser," + " traffic_source, uri, event_type" + ), + "inventory_items": ( + "id, product_id, created_at, sold_at, cost, product_category," + " product_brand, product_distribution_center_id" + ), + "distribution_centers": "id, name, latitude, longitude", +} + +_CANNED_ROWS = [ + {"region": "US-West", "revenue": 412300.50}, + {"region": "US-East", "revenue": 387910.25}, + {"region": "EMEA", "revenue": 295004.10}, + {"region": "APAC", "revenue": 188777.75}, +] + +# ------------------------------------------------- micro-warehouse engine +# The "intelligent mock executor": a deterministic synthetic fact table +# (24 months x 4 regions x 4 categories) plus lightweight SQL-INTENT parsing. +# Instead of pattern-matching to a canned answer, run_query AGGREGATES the +# facts according to the query's grouping (month/region/category), window +# (INTERVAL N YEAR/QUARTER/MONTH), filters (country/region literals), and +# measure alias (SUM(...) AS ). Honest scope: it executes the query's +# INTENT, not its SQL — a real BigQuery backend is the production step. +_REGION_WEIGHT = {"US-West": 1.00, "US-East": 0.95, "EMEA": 0.72, "APAC": 0.46} +_CATEGORY_WEIGHT = { + "Outerwear": 0.34, + "Jeans": 0.27, + "Activewear": 0.22, + "Accessories": 0.17, +} +_MONTHS = [f"{y}-{m:02d}" for y in (2024, 2025) for m in range(1, 13)] +_BASE_MONTHLY = 142000.0 + + +def _seasonal(i: int) -> float: + # mild growth + yearly seasonality — deterministic, no RNG. 
+ return 1.0 + 0.18 * math.sin(i * math.pi / 6) + 0.012 * i + + +_FACTS = [ + { + "month": month, + "region": region, + "category": category, + "revenue": round(_BASE_MONTHLY * rw * cw * _seasonal(i), 2), + } + for i, month in enumerate(_MONTHS) + for region, rw in _REGION_WEIGHT.items() + for category, cw in _CATEGORY_WEIGHT.items() +] + + +def _query_engine(sql_text: str) -> list[dict]: + """Aggregate the synthetic facts according to the SQL's intent.""" + s = (sql_text or "").lower() + # time window: last N months from the warehouse's end (default: a quarter) + m_y = re.search(r"interval\s+(\d+)\s+year", s) + m_q = re.search(r"interval\s+(\d+)\s+quarter", s) + m_m = re.search(r"interval\s+(\d+)\s+month", s) + if m_y: + n = int(m_y.group(1)) * 12 + elif m_q: + n = int(m_q.group(1)) * 3 + elif m_m: + n = int(m_m.group(1)) + elif "year" in s: + n = 12 + else: + n = 3 + months = set(_MONTHS[-min(n, len(_MONTHS)) :]) + facts = [f for f in _FACTS if f["month"] in months] + # filters: country / region literals + if "united states" in s or "'us'" in s: + facts = [f for f in facts if f["region"].startswith("US-")] + for region in _REGION_WEIGHT: + if f"'{region.lower()}'" in s: + facts = [f for f in facts if f["region"] == region] + for category in _CATEGORY_WEIGHT: + if f"'{category.lower()}'" in s: + facts = [f for f in facts if f["category"] == category] + # measure name: honor the SQL's alias when present + alias = re.search(r"sum\([^)]*\)\s+as\s+([a-z_][a-z0-9_]*)", s) + measure = alias.group(1) if alias else "revenue" + # time grain: DATE_TRUNC(..., G) / EXTRACT(G FROM ...) / AS g / GROUP BY g. + # Scope the GROUP BY check to the actual clause (stop at ORDER BY/LIMIT) + # with INTERVAL phrases stripped — a trailing "INTERVAL 1 YEAR" window + # must not read as a yearly grouping. 
+ gb_match = re.search(r"group by\s+(.*?)(?:\border by\b|\blimit\b|$)", s) + gb_clause = re.sub( + r"interval\s+\d+\s+\w+", "", gb_match.group(1) if gb_match else "" + ) + grain = None + for g in ("month", "week", "quarter", "year"): + if ( + re.search(rf"date_trunc\([^)]*,\s*{g}\s*\)", s) + or re.search(rf"extract\(\s*{g}\s+from", s) + or re.search(rf"\bas\s+{g}\b", s) + or re.search(rf"\b{g}\b", gb_clause) + ): + grain = "month" if g == "week" else g # weekly facts -> monthly grain + break + if grain: + + def bucket(month: str) -> str: + y, mm = month.split("-") + if grain == "month": + return month + if grain == "quarter": + return f"{y}-Q{(int(mm) - 1) // 3 + 1}" + return y # year + + agg: dict = {} + for f in facts: + b = bucket(f["month"]) + agg[b] = agg.get(b, 0.0) + f["revenue"] + return [{grain: k, measure: round(v, 2)} for k, v in sorted(agg.items())] + # categorical dimension + if "category" in s or "department" in s: + dim = "category" + elif "region" in s or "country" in s: + dim = "region" + else: + dim = None + if dim is None: + return [{measure: round(sum(f["revenue"] for f in facts), 2)}] + agg = {} + for f in facts: + agg[f[dim]] = agg.get(f[dim], 0.0) + f["revenue"] + return [ + {dim: k, measure: round(v, 2)} + for k, v in sorted(agg.items(), key=lambda kv: -kv[1]) + ] + + +# ------------------------------------------------- REAL BigQuery backend +# When credentials allow, dry_run and run_query hit the REAL +# bigquery-public-data.thelook_ecommerce dataset (billed to +# GOOGLE_CLOUD_PROJECT) — real dry-run errors, real bytes-scanned, real +# multi-dimensional results. Safety rails: maximum_bytes_billed caps each +# query, results cap at _MAX_ROWS. Anything that fails falls back to the +# deterministic micro-warehouse above, so CI and credential-less machines +# keep working. CA_DEMO_USE_BIGQUERY=0 forces the mock. 
+_BQ_DATASET = "bigquery-public-data.thelook_ecommerce" +_MAX_BYTES_BILLED = 2 * 1024**3 # 2 GB per query +_MAX_ROWS = 500 +_BQ = { + "client": None, + "disabled": os.environ.get("CA_DEMO_USE_BIGQUERY", "1") != "1", + "error": None, +} + + +def _bq_client(): + if _BQ["disabled"] or _BQ["error"]: + return None + if _BQ["client"] is None: + try: + from google.cloud import bigquery # optional dependency + + _BQ["client"] = bigquery.Client( + project=os.environ.get("GOOGLE_CLOUD_PROJECT") or None + ) + except Exception as e: # no lib / no credentials -> mock warehouse + _BQ["error"] = f"{type(e).__name__}: {e}" + return None + return _BQ["client"] + + +def _qualify_sql(sql: str) -> str: + """Fully qualify bare thelook table refs for real BigQuery.""" + s = (sql or "").replace("`", "") + s = re.sub( + r"(? dict: + sql = _qualify_sql(_sql_of(value)) + # Preserve the user's question through the dry run: after a FAILURE this + # output becomes the loop-carried value, and the repair round needs full + # context (question + sql + error), not just sql + error. 
+ question = str(_field_of(value, "question", "") or "") + client = _bq_client() + if client is None: + return { + "sql": sql, + "question": question, + "valid": "select" in sql.lower(), + "error": None, + "engine": "mock", + } + from google.cloud import bigquery + + try: + job = client.query( + sql, + job_config=bigquery.QueryJobConfig(dry_run=True, use_query_cache=False), + ) + return { + "sql": sql, + "question": question, + "valid": True, + "error": None, + "engine": "bigquery", + "bytes_processed": int(job.total_bytes_processed or 0), + } + except Exception as e: # the REAL BigQuery error feeds the repair story + return { + "sql": sql, + "question": question, + "valid": False, + "error": str(e)[:500], + "engine": "bigquery", + } + + +def _execute_sql(value) -> dict: + sql = _qualify_sql(_sql_of(value)) + client = _bq_client() + if client is not None: + from google.cloud import bigquery + + try: + job = client.query( + sql, + job_config=bigquery.QueryJobConfig( + maximum_bytes_billed=_MAX_BYTES_BILLED + ), + ) + rows = [ + {k: _jsonify_cell(v) for k, v in dict(r).items()} + for r in job.result(max_results=_MAX_ROWS) + ] + return { + "rows": rows, + "engine": "bigquery", + "bytes_processed": int(job.total_bytes_processed or 0), + } + except Exception as e: + # A failing query must NOT fabricate an answer from the mock — that + # path is only for missing credentials. Return the failure honestly; + # the repair loop upstream exists to prevent reaching here. + return {"rows": [], "engine": "bigquery", "error": str(e)[:300]} + return {"rows": _query_engine(sql), "engine": "mock"} + + +def query_thelook(sql: str) -> dict: + """Run ONE read-only StandardSQL SELECT against the public dataset + bigquery-public-data.thelook_ecommerce to check a claim. Use small + aggregate queries (GROUP BY / COUNT / SUM); results are capped. Returns + rows, the executing engine, and the real error when the SQL is invalid. 
+ """ + out = _execute_sql({"sql": sql}) + return { + "rows": out.get("rows", [])[:50], + "engine": out.get("engine"), + "error": out.get("error"), + } + + +# Mock fallback profiles (used WITHOUT credentials; clearly labeled via the +# engine field — with credentials, profiling queries the real __TABLES__). +_CANNED_PROFILES = { + "orders": {"table": "orders", "row_count": 125000, "size_mb": 11.0}, + "order_items": { + "table": "order_items", + "row_count": 182000, + "size_mb": 24.0, + }, + "products": {"table": "products", "row_count": 29120, "size_mb": 4.8}, + "users": {"table": "users", "row_count": 100000, "size_mb": 27.0}, + "events": {"table": "events", "row_count": 2400000, "size_mb": 740.0}, + "inventory_items": { + "table": "inventory_items", + "row_count": 490000, + "size_mb": 138.0, + }, + "distribution_centers": { + "table": "distribution_centers", + "row_count": 10, + "size_mb": 0.1, + }, +} + + +_TABLE_LIST_CACHE: dict = {} + + +def _live_table_list() -> list: + """The dataset's ACTUAL non-empty tables from __TABLES__ (cached per + process), falling back to the curated catalogue without credentials. + Empty strays (e.g. 
the 0-row 'thelook_ecommerce-table' placeholder) are + excluded — matching the production CA agent's 7-table scope.""" + if "tables" in _TABLE_LIST_CACHE: + return _TABLE_LIST_CACHE["tables"] + tables = list(TABLES) + if _bq_client() is not None: + out = _execute_sql({ + "sql": ( + "SELECT table_id FROM" + " `bigquery-public-data.thelook_ecommerce.__TABLES__` WHERE" + " row_count > 0 ORDER BY table_id" + ) + }) + live = [r["table_id"] for r in out.get("rows") or []] + if live: + tables = live + _TABLE_LIST_CACHE["tables"] = tables + return tables + + +def _profile_table(value) -> dict: + """REAL table profile from BigQuery __TABLES__ metadata (row count, size) + when credentials allow; the canned fallback otherwise — engine-labeled.""" + name = str(value).strip().strip("`'\"") + if _bq_client() is not None and re.fullmatch(r"[A-Za-z_][\w-]*", name): + out = _execute_sql({ + "sql": ( + "SELECT table_id, row_count, size_bytes FROM" + " `bigquery-public-data.thelook_ecommerce.__TABLES__` WHERE" + f" table_id = '{name}'" + ) + }) + rows = out.get("rows") or [] + if rows: + return { + "table": rows[0]["table_id"], + "row_count": int(rows[0]["row_count"]), + "size_mb": round(float(rows[0]["size_bytes"]) / 1048576, 1), + "engine": "bigquery", + } + prof = dict( + _CANNED_PROFILES.get( + name, {"table": name, "row_count": 0, "size_mb": 0.0} + ) + ) + prof["engine"] = "mock" + return prof + + +_JUDGE_RANK = {"bar": 0, "line": 1, "scatter": 2, "pie": 3} + + +# ------------------------------------------------- typed outputs (LLM caps) +class Sql(BaseModel): + sql: str + # Echoed by the SQL-drafting capabilities so the loop-carried value still + # holds the user's question after a FAILED dry run — the repair round + # repairs with full context (question + sql + real error), not sql+error. 
+ question: str = "" + + +class DryRunResult(BaseModel): + sql: str + valid: bool + error: str | None = None + engine: str = "mock" + question: str = "" + bytes_processed: int = 0 + + +class QueryResult(BaseModel): + rows: list[dict] + engine: str = "mock" + bytes_processed: int = 0 + error: str | None = None + + +class ChartArtifact(BaseModel): + chart_type: str + x_field: str + y_field: str + series_field: str | None = None + ascii: str + vega_lite: dict + + +class TableProfile(BaseModel): + table: str + row_count: int + size_mb: float + engine: str = "mock" + + +class QualityReport(BaseModel): + tables: int + total_rows: int + largest_table: str + total_size_mb: float + + +class SchemaAnswer(BaseModel): + answer: str + + +class VerifiedInsights(BaseModel): + verified: list[str] + rejected: list[str] + + +class Insight(BaseModel): + insight: str + + +class Category(BaseModel): + category: Literal["data", "schema"] + + +class Verdict(BaseModel): + insight: str + refuted: bool + reason: str = "" # the skeptic must SHOW ITS WORK — one-sentence judgment + + +class Intent(BaseModel): + """The conversational gate's verdict for untriggered messages.""" + + intent: Literal["data", "meta", "chat"] + reply: str = "" + + +def _obj_of(v): + """Accept a dict, a JSON-encoded dict/list string, or return None.""" + if isinstance(v, (dict, list)): + return v + if isinstance(v, str): + try: + parsed = json.loads(v) + if isinstance(parsed, (dict, list)): + return parsed + except (ValueError, TypeError): + pass + return None + + +def _sql_of(v) -> str: + """The SQL text from an Sql dict, a JSON string, or a raw SQL string.""" + obj = _obj_of(v) + if isinstance(obj, dict): + return str(obj.get("sql", "")) + return v if isinstance(v, str) else "" + + +def _field_of(v, key, default=None): + obj = _obj_of(v) + if isinstance(obj, dict): + return obj.get(key, default) + return default + + +def _verdict_of(v) -> dict: + obj = _obj_of(v) + if isinstance(obj, dict) and "insight" in obj: + 
return { + "insight": str(obj["insight"]), + "refuted": bool(obj.get("refuted")), + "reason": str(obj.get("reason", "") or ""), + } + return {"insight": str(v), "refuted": False, "reason": ""} + + +def _verdict_lines(state: dict): + """Render every skeptic verdict found in interpreter state — one line per + insight, with the skeptic's stated reason — or [] when no audit ran.""" + lines = [] + for value in state.values(): + if not (isinstance(value, list) and value): + continue + verdicts = [_verdict_of(item) for item in value] + if not all( + _obj_of(item) and "refuted" in (_obj_of(item) or {}) for item in value + ): + continue + for v in verdicts: + mark = "❌ REFUTED" if v["refuted"] else "✅ upheld" + reason = f" — {v['reason']}" if v["reason"] else "" + lines.append(f"{mark} — \"{v['insight']}\"{reason}") + return lines + + +_VEGA_MARK = {"bar": "bar", "line": "line", "scatter": "point", "pie": "arc"} + + +def _ascii_bars(rows, x=None, y=None, series=None, width: int = 24) -> str: + """A Unicode bar preview of the rows — renders in the chat. 
Uses the + derived x/y/series fields when given (so an integer `year` column is + never mistaken for the measure); falls back to first-str/first-number.""" + pts = [] + for r in rows or []: + if not isinstance(r, dict): + continue + if x in r: + label = str(r.get(x, "?")) + else: + label = next((str(v) for v in r.values() if isinstance(v, str)), "?") + if series and series in r: + label = f"{r[series]} {label}" + if y in r: + num = float(r.get(y) or 0.0) + else: + num = next( + (float(v) for v in r.values() if isinstance(v, (int, float))), 0.0 + ) + pts.append((label, num)) + if not pts: + return "(no rows)" + pts = pts[:40] # keep the chat readable for wide results + mx = max(n for _, n in pts) or 1.0 + lw = max(len(label) for label, _ in pts) + return "\n".join( + f"{label:<{lw}} {'█' * max(1, round(n / mx * width)):<{width}} " + f" {n:>14,.2f}" + for label, n in pts + ) + + +def _render_chart(v) -> dict: + """Build a chart from whatever the authored binding hands over: query + output (dict with rows), raw rows (list of dicts), a tournament winner + (list with one chart-type string), or a bare chart-type string. Emits the + Conversational-Analytics-style artifact: a Vega-Lite spec + a text + preview the chat can render.""" + chart_type, rows, explicit = "bar", None, False + obj = _obj_of(v) + if isinstance(obj, dict): + rows = obj.get("rows", rows) + if str(obj.get("chart_type", "")) in _VEGA_MARK: + chart_type, explicit = str(obj["chart_type"]), True + elif isinstance(obj, list) and obj: + if isinstance(obj[0], dict): + rows = obj + elif str(obj[0]) in _VEGA_MARK: + chart_type, explicit = str(obj[0]), True + elif isinstance(v, str) and v in _VEGA_MARK: + chart_type, explicit = v, True + + # date-shaped x labels (a time series) default to a LINE mark unless the + # chart type was chosen explicitly (e.g. by the tournament winner). 
+ def _datelike(r) -> bool: + return isinstance(r, dict) and any( + isinstance(val, str) and re.match(r"^\d{4}(-\d{2}|-q\d|$)", val.lower()) + for val in r.values() + ) + + if not explicit and len(rows or []) >= 2 and all(map(_datelike, rows)): + chart_type = "line" + if rows is None: + # No rows handed over (e.g. the tournament passes only the winning + # chart type): chart REAL revenue-by-region data, not canned values. + rows = ( + _execute_sql({ + "sql": ( + "SELECT u.country AS region, SUM(oi.sale_price) AS revenue" + " FROM thelook_ecommerce.order_items AS oi JOIN" + " thelook_ecommerce.orders AS o ON oi.order_id = o.order_id" + " JOIN thelook_ecommerce.users AS u ON o.user_id = u.id" + " GROUP BY region ORDER BY revenue DESC LIMIT 8" + ) + }).get("rows") + or _CANNED_ROWS + ) + first = rows[0] if rows and isinstance(rows[0], dict) else {} + timeish = ("year", "quarter", "month", "week", "date", "day") + str_fields = [k for k, v in first.items() if isinstance(v, str)] + time_fields = [ + k + for k, v in first.items() + if k.lower() in timeish + or (isinstance(v, str) and re.match(r"^\d{4}([-/]\d{2})?", v)) + ] + num_fields = [ + k + for k, v in first.items() + if isinstance(v, (int, float)) + and not isinstance(v, bool) + and k not in time_fields + ] + x_field = ( + time_fields[0] + if time_fields + else (str_fields[0] if str_fields else "label") + ) + # a second categorical field becomes the SERIES (one line per value) — + # e.g. GROUP BY region, year -> x=year, one series per region. 
+ series_field = next( + (k for k in str_fields if k != x_field and k not in time_fields), None + ) + if series_field is None and len(time_fields) == 0: + series_field = None + + def _measure_rank(k: str) -> int: + kl = k.lower() + if "revenue" in kl or "sales" in kl: + return 0 + if "total" in kl or "amount" in kl: + return 1 + return 2 + + y_field = sorted(num_fields, key=_measure_rank)[0] if num_fields else "value" + if series_field and not explicit: + chart_type = "line" # comparing series over a dimension -> lines + encoding = { + "x": {"field": x_field, "type": "nominal"}, + "y": {"field": y_field, "type": "quantitative"}, + } + if series_field: + encoding["color"] = {"field": series_field, "type": "nominal"} + return { + "chart_type": chart_type, + "x_field": x_field, + "y_field": y_field, + "series_field": series_field, + "ascii": _ascii_bars(rows, x=x_field, y=y_field, series=series_field), + "vega_lite": { + "$schema": "https://vega.github.io/schema/vega-lite/v5.json", + "mark": _VEGA_MARK[chart_type], + "data": {"values": rows}, + "encoding": encoding, + }, + } + + +def _chart_png(chart: dict): + """Render the chart artifact to PNG bytes via matplotlib, or None. + + Optional dependency: without matplotlib the demo falls back to the text + preview + Vega-Lite spec (which any Vega editor renders faithfully).""" + try: + import matplotlib + + matplotlib.use("Agg") + import matplotlib.pyplot as plt + except ImportError: + return None + import io + + rows = chart["vega_lite"]["data"]["values"] + x, y = chart.get("x_field", "label"), chart.get("y_field", "value") + series = chart.get("series_field") + kind = chart["chart_type"] + fig, ax = plt.subplots(figsize=(6.8, 3.6), dpi=144) + if series: + # one line per series value (e.g. 
per region), x shared + by_series: dict = {} + for r in rows: + by_series.setdefault(str(r.get(series, "?")), []).append( + (str(r.get(x, "?")), float(r.get(y) or 0.0)) + ) + for name, pts in sorted(by_series.items()): + pts.sort() + ax.plot([p[0] for p in pts], [p[1] for p in pts], marker="o", label=name) + ax.legend(fontsize=8) + else: + labels = [str(r.get(x, "?")) for r in rows] + values = [float(r.get(y) or 0.0) for r in rows] + if kind == "pie": + ax.pie(values, labels=labels, autopct="%1.0f%%") + elif kind == "line": + ax.plot(labels, values, marker="o", color="#4285F4") + elif kind == "scatter": + ax.scatter(labels, values, s=80, color="#4285F4") + else: + ax.bar(labels, values, color="#4285F4") + if kind != "pie" or series: + ax.set_ylabel(y) + ax.grid(axis="y", alpha=0.3) + ax.spines[["top", "right"]].set_visible(False) + if len({str(r.get(x, "")) for r in rows}) > 8: + ax.tick_params(axis="x", labelrotation=60, labelsize=7) + title = f"{y} by {x}" + (f" per {series}" if series else "") + f" ({kind})" + ax.set_title(title) + fig.tight_layout() + buf = io.BytesIO() + fig.savefig(buf, format="png") + plt.close(fig) + return buf.getvalue() + + +def _stub(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + +def _llm(name, output_schema, instruction): + return lambda: Agent( + name=name, + model=MODEL, + output_schema=output_schema, + generate_content_config=DET, + instruction=instruction, + ) + + +def _registry() -> CapabilityRegistry: + schema_blurb = "; ".join(f"{t}({c})" for t, c in TABLES.items()) + return CapabilityRegistry([ + # ---- live language capabilities (Gemini) ---- + Capability( + name="nl2sql", + input_kind="item", + output_model=Sql, + serialize_input=True, + build=_llm( + "nl2sql", + Sql, + "Translate the question in the input JSON to one BigQuery" + " StandardSQL SELECT over the public dataset" + " bigquery-public-data.thelook_ecommerce (use 
fully-qualified" + f" table names): {schema_blurb}. Output Sql, echoing the" + " question field.", + ), + ), + Capability( + name="draft_or_repair_sql", + input_kind="item", + output_model=Sql, + serialize_input=True, + build=_llm( + "draft_or_repair_sql", + Sql, + "Input JSON has a question, and possibly a prior sql + error" + " from a failed dry run, and possibly human feedback. If" + " there is feedback, REVISE the sql to follow it exactly. If" + " there is an error, REPAIR the sql using it; if the sql is" + " valid (no error, no feedback), return it unchanged." + " Otherwise draft" + " one BigQuery StandardSQL SELECT over the public dataset" + " bigquery-public-data.thelook_ecommerce (fully-qualified" + f" table names): {schema_blurb}. Output Sql, echoing the" + " question field.", + ), + ), + Capability( + name="summarize_insight", + input_kind="item", + output_model=Insight, + serialize_input=True, + build=_llm( + "summarize_insight", + Insight, + "Input: JSON query results (or profiling stats). Output" + " Insight: one crisp analyst sentence.", + ), + ), + Capability( + name="classify_question", + input_kind="item", + output_model=Category, + serialize_input=True, + build=_llm( + "classify_question", + Category, + "Classify the user question: 'data' if it needs a SQL query" + " over the tables, 'schema' if it asks what a column/value" + " means. Output Category.", + ), + ), + Capability( + name="skeptic", + input_kind="item", + output_model=Verdict, + serialize_input=True, + # v2: the skeptic became DATA-GROUNDED (a real query tool) — a + # semantic contract change, so stored plans drift-reject and + # re-author rather than silently reusing the plausibility-only + # skeptic. ADK supports output_schema + tools together: tools in + # the thought loop, structure enforced on the final output. 
+ version="2", + build=lambda: Agent( + name="skeptic", + model=MODEL, + output_schema=Verdict, + generate_content_config=DET, + tools=[query_thelook], + instruction=( + "You are an adversarial DATA reviewer with a real" + " BigQuery tool. Input: one insight/claim about the" + " public dataset bigquery-public-data.thelook_ecommerce" + f" ({schema_blurb}). Do NOT judge from priors: VERIFY the" + " claim by running 1-3 small aggregate SELECTs with the" + " query_thelook tool and compare the actual numbers to" + " the claim. Then output Verdict: echo the claim as" + " insight; refuted=true only if the data contradicts it;" + " reason = one sentence citing the numbers you queried" + " (note caveats like partial years)." + ), + ), + ), + # ---- deterministic mocks (no BigQuery needed) ---- + Capability( + name="dry_run", + input_kind="item", + output_model=DryRunResult, + serialize_input=False, + build=_stub("dry_run", _bq_dry_run), + ), + Capability( + name="sql_ok", + input_kind="item", + serialize_input=False, + build=_stub( + "sql_ok", + lambda s: bool( + _field_of(s, "valid", s if s is not None else False) + ), + ), + ), + Capability( + name="run_query", + input_kind="item", + output_model=QueryResult, + serialize_input=False, + build=_stub("run_query", _execute_sql), + ), + Capability( + name="profile_table", + output_model=TableProfile, + input_kind="item", + serialize_input=False, + max_fan_out=20, + build=_stub("profile_table", _profile_table), + ), + Capability( + name="quality_report", + output_model=QualityReport, + input_kind="list", + serialize_input=False, + build=_stub( + "quality_report", + lambda profiles: { + "tables": len(profiles), + "total_rows": sum( + int(p.get("row_count", 0)) for p in profiles + ), + "largest_table": ( + max(profiles, key=lambda p: p.get("row_count", 0))[ + "table" + ] + if profiles + else "" + ), + "total_size_mb": round( + sum(float(p.get("size_mb", 0)) for p in profiles), 1 + ), + }, + ), + ), + Capability( + 
# ------------------------------------------------- scenarios
# Capability catalogue spliced into the planner instruction by
# _planner_instruction: one "name (input kind: input -> output)" entry per
# capability the planner is allowed to use.
_CAPS_BLURB = (
    # NOTE: instruction strings must stay BRACE-FREE — ADK treats a
    # curly-brace identifier in an instruction as session-state injection
    # and raises KeyError on unknown variables.
    # NOTE(review): the skeptic entry reads "item: one — ...; insight"; the
    # "data-grounded" aside appears to split the phrase "one insight".
    # Confirm the wording is intended before editing this prompt text.
    "nl2sql (item: a question object -> Sql with field sql),"
    " draft_or_repair_sql (item: a question plus optional prior sql and error"
    " -> Sql), summarize_insight (item: rows or stats JSON -> Insight with"
    " field insight), classify_question (item: a question -> Category with"
    " field category equal to 'data' or 'schema'), skeptic (item: one —"
    " data-grounded: it runs real verification queries via its query_thelook"
    " tool; insight -> Verdict with fields insight and refuted), dry_run (item:"
    " Sql or a task with sql -> object with sql, valid, error — the REAL"
    " BigQuery dry-run), sql_ok (item: dry-run output -> bool), run_query"
    " (item: validated sql -> object with rows), profile_table (item: a table"
    " name -> stats object), quality_report (LIST of stats -> report object),"
    " describe_schema (item: a question -> object with answer), keep_verified"
    " (LIST of Verdicts -> object with verified and rejected), render_chart"
    " (item: query output with rows, or a chart-type winner -> a chart artifact"
    " with chart_type, ascii preview, and a vega_lite spec), pair_charts (LIST"
    " -> list of pairs), judge_chart (item: a pair -> the winner), single_chart"
    " (LIST -> bool)."
)

# Binding cheat-sheet that _planner_instruction appends after the scenario
# recipe.
# NOTE(review): "path=)" and "step=)" have nothing after the "=" — it looks
# like placeholder names were lost. TODO confirm against the intended prompt.
_BINDING_RULES = (
    " Binding rules: Binding(source='task', path=) reads the task"
    " input; Binding(source='step', step=) chains steps; pipeline"
    " stages take the previous stage's per-item output automatically."
)
+ return { + "sequence": dict( + title="Ask a question (draft → REAL dry-run → repair → execute)", + shape=( + "loop_until(draft_or_repair → real dry_run) → run_query →" + " render_chart + step" + ), + triggers=("revenue by region", "sequence"), + task={"question": q_region}, + recipe=( + "Author, in order: (1) ONE loop_until for SQL drafting with" + " self-repair: init = Binding(source='task'); body = [(a) a" + " step running draft_or_repair_sql whose input is" + " Binding(source='step', step=) — round 0" + " reads the task, later rounds read the failed dry-run output" + " (sql + error); (b) a step running dry_run on (a)];" + " until_capability = sql_ok with until_input =" + " Binding(source='step', step=); max_iters = 3." + " (2) a step running run_query on the loop's output. (3) a" + " step running render_chart on the run_query step's output." + " (4) a step running summarize_insight on the run_query" + " step's output. Output = the summarize step." + ), + ), + "fanout": dict( + title="Profile data quality (fan-out / synthesize)", + shape="fan_out → step", + triggers=("profile", "data quality"), + task={"tables": list(TABLES)}, + recipe=( + "Author: (1) a fan_out over task.tables running profile_table" + " per table; (2) a step running quality_report on the fan_out" + " output. Output = the report step." 
+ ), + ), + "pipeline": dict( + title="Build a dashboard (pipeline — executes every panel)", + shape=( + "pipeline(draft → REAL dry_run → run_query → render_chart)" + " per panel, barrier-free" + ), + triggers=("dashboard",), + task={ + "questions": [ + {"question": "Monthly total revenue for 2025"}, + { + "question": ( + "Top 5 product categories by total revenue in 2025" + ) + }, + { + "question": ( + "New users per month in 2025 broken down by" + " traffic source" + ) + }, + ] + }, + recipe=( + "Author: ONE pipeline over task.questions with FOUR stages —" + " draft_or_repair_sql, then dry_run, then run_query, then" + " render_chart — so each dashboard panel is translated," + " validated, EXECUTED, and charted per item, barrier-free." + " Output = the pipeline." + ), + ), + "branch": dict( + title="Route the question (classify & route)", + shape="step → branch", + triggers=("route", "what does", "mean"), + task={"question": "What does order status 'Complete' mean?"}, + recipe=( + "Author: (1) a step running classify_question on the task;" + " (2) a branch on that step's 'category' field" + " (Binding(source='step', step=, path='category')) with" + " TWO routes: value 'data' -> a block [nl2sql on task," + " dry_run, run_query, summarize_insight]; value 'schema' -> a" + " block [describe_schema on task]. Output = the branch." 
+ ), + ), + "loop": dict( + title="SQL self-repair from a REAL broken query (loop_until)", + shape="loop_until(REAL dry_run → repair) → run_query", + triggers=("repair", "unreliable", "retry", "broken"), + task={ + "question": q_region, + "sql": ( # 'order' instead of 'orders' -> a REAL not-found error + "SELECT u.country AS region, SUM(oi.sale_price) AS revenue" + " FROM thelook_ecommerce.order_items AS oi JOIN" + " thelook_ecommerce.order AS o ON oi.order_id =" + " o.order_id JOIN thelook_ecommerce.users AS u ON" + " o.user_id = u.id GROUP BY region ORDER BY revenue DESC" + ), + }, + recipe=( + "Author, in order: (1) ONE loop_until: init =" + " Binding(source='task'); body = [(a) a step running dry_run" + " whose input is Binding(source='step', step=) — round 0 checks the task's sql, later rounds check" + " the repaired sql; (b) a step running draft_or_repair_sql" + " on (a) — it reads question + sql + the REAL BigQuery error" + " and outputs a fixed Sql (if there is no error, return the" + " sql unchanged)]; until_capability = sql_ok with until_input" + " = Binding(source='step', step=); max_iters =" + " 3. (2) a step running run_query on the loop's output." + " Output = the run_query step." 
+ ), + ), + "revise": dict( + title="Revise the frozen SQL from human feedback", + shape=( + "feedback → draft_or_repair (REAL dry-run) → re-freeze → execute" + ), + triggers=( + "revise", + "update the sql", + "update the query", + "change the query", + "instead of", + "redefine", + ), + task={}, + recipe="", + ), + "adversarial": dict( + title="Audit insights (adversarial verification)", + shape="fan_out(skeptic) → step(keep_verified)", + triggers=("audit", "verify insights"), + task={ + "insights": [ + "Average order value is roughly $75.", + "The average order value is $1,000,000.", + "Most users arrive via organic search.", + ] + }, + recipe=( + "Author: (1) a fan_out over task.insights running skeptic per" + " insight; (2) a step running keep_verified on the fan_out" + " output. Output = the keep_verified step." + ), + ), + "tournament": dict( + title="Pick the best chart (tournament)", + shape=( + "loop_until(init=task.chart_options, body=[pair, fan_out])" + " → render_chart" + ), + triggers=("best chart", "tournament"), + task={"chart_options": ["pie", "bar", "line", "scatter"]}, + recipe=( + "Author ONE loop_until: init = Binding(source='task'," + " path='chart_options'); body = [(a) a step running" + " pair_charts whose input is Binding(source='step', step=); (b) a fan_out over (a) running judge_chart" + " per pair]; until_capability = single_chart with until_input" + " = Binding(source='step', step=); max_iters" + " = 3. Then (2) a step running render_chart on the loop's" + " output (the winning chart type). Output = the render_chart" + " step." 
+ ), + ), + } + + +SCENARIOS = _scenario_defs() + + +def _text_of(node_input) -> str: + """The user's message text, whatever shape the node input arrives in.""" + if isinstance(node_input, str): + return node_input + for holder in (node_input, getattr(node_input, "content", None)): + parts = getattr(holder, "parts", None) + if parts: + return " ".join(p.text for p in parts if getattr(p, "text", None)) + return str(node_input or "") + + +def _extract_insights(text: str): + """Insights inlined in an audit ask ('audit this insight: X' / lists + split on ';' or newlines), or None when the message is trigger-only.""" + t = (text or "").strip() + tl = t.lower() + for trig in ("verify insights", "audit", "verify"): + i = tl.find(trig) + if i < 0: + continue + rest = t[i + len(trig) :] + rest = re.sub( + r"^[\s:,\-—]*((these|this|the|my)\s+)?(insights?|claims?|ingisht\w*)?[\s:,\-—]*", + "", + rest, + flags=re.I, + ) + rest = rest.strip().strip('"').rstrip("?!.").strip() + if len(rest) >= 12: + parts = [s.strip() for s in re.split(r"[;\n]+", rest) if s.strip()] + return parts or None + return None + return None + + +def _task_for(key: str, text: str, last_insight: str | None = None) -> dict: + """The scenario's task input. LIVE inputs where they make sense: + + * sequence: the user's message IS the question; + * adversarial: insights inlined in the message are audited; with none + inlined, the session's LAST generated insight ('audit that'); only + then the canned demo set. 
def _matched_scenario(text: str):
    """Return the key of the first specialized scenario whose trigger phrase
    occurs in the message (case-insensitive substring match), or None so the
    conversational gate can decide. 'sequence' is never returned here."""
    lowered = (text or "").lower()
    return next(
        (
            name
            for name, scenario in SCENARIOS.items()
            if name != "sequence"
            and any(phrase in lowered for phrase in scenario["triggers"])
        ),
        None,
    )
def _scenario_for(text: str) -> str:
    """Specialized scenarios win over the generic ask-a-question fallback.

    'sequence' is the default for ANY question, so its triggers must never
    shadow a specialized intent — e.g. "best chart for revenue by region"
    contains both a tournament trigger and a sequence trigger and must route
    to the tournament.

    Args:
      text: The user's message; None is treated as empty.

    Returns:
      The key of the first specialized scenario whose trigger the message
      hits, otherwise "sequence".
    """
    # Delegate the trigger scan to _matched_scenario (which already skips
    # 'sequence') so the two routing entry points cannot drift apart.
    return _matched_scenario(text) or "sequence"
def _freeze_sql(
    question: str,
    sql: str,
    *,
    engine: str = "bigquery",
    bytes_processed: int = 0,
    feedback: str | None = None,
    previous: dict | None = None,
    plan_hash: str | None = None,
    produced_by_step: str | None = None,
) -> str:
    """Freeze (or revise) the validated SQL for a question.

    SQL freezing extends plan freezing one level deeper: the drafting LLM is
    the remaining nondeterministic step in a frozen plan, so caching its
    dry-run-validated output makes replays NUMERICALLY deterministic. A
    human-feedback revision appends to `revisions` — the feedback itself
    becomes part of the auditable artifact (who changed the query and why).

    Args:
      question: The question the SQL answers; its digest names the file.
      sql: The validated SQL to freeze.
      engine: Engine label stored with the record.
      bytes_processed: Bytes figure stored with the record (None counts as 0).
      feedback: Human feedback behind a revision; recorded only together
        with `previous`.
      previous: The record being revised. It is copied, never mutated.
      plan_hash: Hash of the plan that produced the SQL, if known.
      produced_by_step: Id of the plan step that produced the SQL, if known.

    Returns:
      Path of the JSON record written under the SQL store.
    """
    os.makedirs(_SQL_STORE, exist_ok=True)
    path = os.path.join(_SQL_STORE, f"{_q_digest(question)}.json")
    rec = (
        dict(previous)
        if previous
        else {
            "question": (question or "").strip(),
            "revisions": [],
        }
    )
    if previous is not None and feedback is not None:
        # dict(previous) is shallow: build a NEW history list so the caller's
        # record (and its `revisions` list) is left untouched.
        history = list(rec.get("revisions") or [])
        history.append({
            "feedback": feedback,
            "previous_sql": previous.get("sql"),
            "revised_at": _now_iso(),
        })
        rec["revisions"] = history
    rec.update({
        "sql": sql,
        "sql_hash": sha256_hex(sql),
        "engine": engine,
        "bytes_processed": int(bytes_processed or 0),
        "validated_at": _now_iso(),
    })
    # WORKFLOW LINEAGE: the middle result is an instance of a specific plan's
    # step for a specific question — record which plan (hash) and which step
    # produced it, so the artifact is structurally attached to the frozen
    # workflow, not just stored beside it. Revisions inherit the lineage.
    if plan_hash:
        rec["plan_hash"] = plan_hash
    if produced_by_step:
        rec["produced_by_step"] = produced_by_step
    # Write beside the target, then swap it in. An interrupted in-place write
    # would leave truncated JSON, which _load_frozen_sql reads as "nothing
    # frozen" — silently dropping the SQL and its revision history.
    tmp_path = f"{path}.tmp"
    with open(tmp_path, "w", encoding="utf-8") as f:
        json.dump(rec, f, indent=1)
    os.replace(tmp_path, path)
    return path
def _load_stored_plan(key: str, registry, task):
    """Import the stored plan envelope for a scenario, defensively.

    Args:
      key: Scenario key; the envelope is read from `<key>.json` in the plan
        store.
      registry: Capability registry the envelope is imported against.
      task: Task input passed to the import as `task_input`.

    Returns:
      (spec, None) on a valid import, (None, reason) on a rejected or
      unreadable envelope, (None, None) when nothing is stored. `reason` is
      capped at 300 characters because it is shown in the chat.
    """

    def _reason(exc: Exception) -> str:
        return f"{type(exc).__name__}: {exc}"[:300]

    path = os.path.join(_PLAN_STORE, f"{key}.json")
    try:
        # Open directly instead of checking existence first: a file removed
        # between the check and the open must read as "nothing stored", not
        # as a rejected import.
        with open(path, encoding="utf-8") as f:
            envelope = json.load(f)
    except FileNotFoundError:
        return None, None
    except Exception as e:  # unreadable/corrupt file
        return None, _reason(e)
    try:
        return import_plan(envelope, registry, task_input=task), None
    except PlanImportError as e:
        return None, str(e)[:300]
    except Exception as e:
        # Deliberately broad: any import failure falls back to authoring a
        # fresh plan, with the reason surfaced to the user.
        return None, _reason(e)
+ raw = await ctx.run_node(_intent_agent(), node_input=text, run_id="intent") + verdict = Intent.model_validate(raw) + if verdict.intent != "data": + yield _msg(verdict.reply or "Ask me a question about the data!") + yield _msg( + "💬 _Conversational turn — no workflow issued (1 intent call," + " 0 planner calls, 0 queries)._" + ) + yield Event(output={"scenario": "conversation", "intent": verdict.intent}) + return + key = "sequence" + sc = SCENARIOS[key] + task = _task_for( + key, + text, + last_insight=ctx.state.get("authored_workflow:ca:last_insight"), + ) + state_key = f"authored_workflow:ca:{key}" + + if key == "sequence": + task_note = f' — question: "{task["question"]}"' + elif key == "adversarial": + src_note = ( + "canned demo set" + if task == sc["task"] + else "YOUR insights (live input)" + ) + task_note = f" — auditing {src_note}: {task['insights']}" + else: + task_note = "" + data_note = ( + "LIVE `bigquery-public-data.thelook_ecommerce`" + if _bq_client() is not None + else "mock `thelook_ecommerce` warehouse (no BigQuery credentials)" + ) + yield _msg( + f"🗂️ **Scenario: {sc['title']}** — expected shape `{sc['shape']}`," + f" over {data_note}" + f" ({', '.join(TABLES)}){task_note}." + ) + + used_frozen_sql = False + if key == "revise": + # HUMAN FEEDBACK on the frozen SQL of the session's last question: + # revise (LLM, feedback-aware) -> re-validate (REAL dry-run) -> + # re-freeze with the feedback recorded in the revision history -> + # execute the revised query. + last_q = ctx.state.get("authored_workflow:ca:last_question") + rec = _load_frozen_sql(last_q) if last_q else None + if rec is None: + yield _msg( + "🛠️ Nothing to revise yet — ask a data question first; its" + " validated SQL gets frozen, and then your feedback can amend it." 
+ ) + yield Event(output={"scenario": "revise", "revised": False}) + return + yield _msg( + f'🛠️ **Revising the frozen SQL** for: "{last_q}" (revision' + f" #{len(rec.get('revisions', [])) + 1})\nYour feedback:" + f" _{text.strip()}_\nCurrent SQL:\n```sql\n{rec['sql']}\n```" + ) + raw = await ctx.run_node( + reg["draft_or_repair_sql"].build(), + node_input=json.dumps( + {"question": last_q, "sql": rec["sql"], "feedback": text.strip()} + ), + run_id="revise_sql", + ) + new_sql = _sql_of(raw) + check = _bq_dry_run({"question": last_q, "sql": new_sql}) + if not check.get("valid"): + yield _msg( + "🛑 The revised SQL failed the REAL dry-run —" + f" `{str(check.get('error'))[:200]}`. The frozen SQL is" + " UNCHANGED (a revision must validate before it replaces the" + " frozen artifact)." + ) + yield Event(output={"scenario": "revise", "revised": False}) + return + _freeze_sql( + last_q, + check["sql"], + engine=str(check.get("engine", "bigquery")), + bytes_processed=check.get("bytes_processed", 0), + feedback=text.strip(), + previous=rec, + ) + yield _msg( + "🧊 **Re-frozen** — the feedback is now part of the artifact's" + " revision history (auditable: who changed the query and why)." 
+ f"\nRevised SQL:\n```sql\n{check['sql']}\n```\nRunning it:" + ) + spec = _frozen_sql_spec() + spec_hash = _hash(spec) + task = {"question": last_q, "sql": check["sql"]} + used_frozen_sql = True + reused = True + + if not used_frozen_sql and key == "sequence": + rec = _load_frozen_sql(task.get("question", "")) + if rec is not None: + pre = _bq_dry_run({"question": task["question"], "sql": rec["sql"]}) + if pre.get("valid"): + spec = _frozen_sql_spec() + spec_hash = _hash(spec) + task = {"question": task["question"], "sql": rec["sql"]} + used_frozen_sql = True + reused = True + yield _msg( + "🧊 **Frozen SQL replay** — this exact question was answered" + f" before; its validated SQL (hash `{rec['sql_hash'][:12]}`," + f" validated {rec['validated_at'][:19]}," + f" {len(rec.get('revisions', []))} human revision(s)) is reused" + " — the drafting LLM is SKIPPED, so the numbers are" + " deterministic given an unchanged dataset. The real dry-run" + " just re-validated it (warehouse-drift check)." + ) + else: + yield _msg( + "🧊 The frozen SQL for this question no longer validates" + f" (warehouse drift): `{str(pre.get('error'))[:160]}` —" + " re-authoring fresh." + ) + + if used_frozen_sql: + pass # spec/task pinned above; skip plan-store/session reuse + else: + spec, source = None, None + # 1. LOAD-OR-AUTHOR. Reuse order: this session's state -> the + # CROSS-SESSION plan store (defensive import) -> author fresh. + existing = None if used_frozen_sql else ctx.state.get(state_key) + if existing: + spec = WorkflowSpec.model_validate(existing) + source = "session state" + elif not used_frozen_sql: + spec, reject = _load_stored_plan(key, reg, task) + if spec is not None: + source = "plan store (CROSS-SESSION import)" + ctx.state[state_key] = spec.model_dump() # cache for this session + elif reject: # noqa: F821 + yield _msg( + f"🛑 **Plan-store import rejected** for `{key}` — {reject}\n" + "Drift never silently replays a stale plan; re-authoring fresh." 
+ ) + if used_frozen_sql: + pass # beats already emitted; spec/task/reused pinned + elif spec is not None: + spec_hash = _hash(spec) + reused = True + fresh_input = task != sc["task"] + yield _msg( + f"♻️ **Reusing frozen plan** for `{key}` from {source} — hash" + f" `{spec_hash}`. The model is NOT re-invoked" + + ( + "; the import recomputed the hash, re-validated against the" + " current registry, and checked contract-hash drift" + if "CROSS-SESSION" in (source or "") + else "" + ) + + ( + " — your NEW question is the task input (**template reuse**:" + " same plan, new data flowing through it)." + if fresh_input + else "." + ) + ) + else: + reused = False + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_planner_instruction(sc), + ) + raw = await ctx.run_node( + planner, node_input=json.dumps(task), run_id=f"plan_{key}" + ) + spec = WorkflowSpec.model_validate(raw) + spec_hash = _hash(spec) + steps = " → ".join(s.kind for s in spec.steps) + yield _msg( + f"📋 **Authored plan** (`{steps}`):\n```json\n" + f"{json.dumps(spec.model_dump(exclude_none=True), indent=1)}\n```" + ) + + # 2. VALIDATE + 2b. INDEPENDENCE LINTS. + warnings = WorkflowSpecValidator(reg).validate(spec) + lints = [w for w in warnings if w.startswith("plan-quality")] + facts = "\n".join(f" - {f}" for f in independence_facts(spec)) + yield _msg( + f"✅ **Validation passed.** 🧪 plan-quality lints: {len(lints)}." + f" Provenance (statically provable):\n{facts}" + + (f"\n⚠️ {lints}" if lints else "") + ) + + # 3. FREEZE (per scenario) + EXPORT (cross-session). + if not reused: + ctx.state[state_key] = spec.model_dump() + record = FrozenWorkflowRecord.freeze( + spec, + planner_model=MODEL, + registry=reg, + created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + task_input=task, + # capture the input schema = TEMPLATE promotion: a new session may + # run this plan on a NEW question, validated against this schema. 
+ task_input_schema={"required": sorted(task)}, + ) + path = _store_plan(key, record) + yield _msg( + f"🔒 **Frozen** under `{state_key}` — hash `{spec_hash}`. 📦" + f" Exported the full record to `{os.path.relpath(path)}` —" + " **a NEW session will import and reuse this plan** (defensive" + " import: hash + registry + contract-hash checks, task input" + " validated against the captured schema)." + ) + + # 4. EXECUTE on the real engine via the #92 supervisor. + t0 = time.perf_counter() + interp = SpecInterpreter(reg, ctx) + result = await interp.execute(spec, task) + elapsed = time.perf_counter() - t0 + verdict_lines = _verdict_lines(interp.state) + if verdict_lines: + rendered = "\n".join(f" - {line}" for line in verdict_lines) + yield _msg( + "🕵️ **Skeptic verdicts** (one independent skeptic per insight —" + f" provably isolated from whatever produced it):\n{rendered}" + ) + for chart in ( + v + for v in interp.state.values() + if isinstance(v, dict) and "vega_lite" in v + ): + png = _chart_png(chart) + if png is not None: + yield Event( + content=types.Content( + role="model", + parts=[ + types.Part( + text=( + f"📈 **Chart ({chart['chart_type']})** — rendered" + " from the Conversational-Analytics-style" + " Vega-Lite artifact:" + ) + ), + types.Part.from_bytes(data=png, mime_type="image/png"), + ], + ) + ) + yield _msg( + "Vega-Lite spec (the portable artifact behind the image):\n" + f"```json\n{json.dumps(chart['vega_lite'], indent=1)}\n```" + ) + else: + yield _msg( + f"📈 **Chart ({chart['chart_type']})** — text preview + Vega-Lite" + " spec (install matplotlib for an inline rendered image):\n```\n" + f"{chart['ascii']}\n```\n```json\n" + f"{json.dumps(chart['vega_lite'], indent=1)}\n```" + ) + display = ( + {k: v for k, v in result.items() if k != "vega_lite"} + if isinstance(result, dict) + else result + ) + yield _msg( + f"📄 **Result:**\n```json\n{json.dumps(display, indent=1, default=str)}" + f"\n```\n📊 **Cost:** {interp.dispatch_count} capability 
dispatches in" + f" {elapsed:.1f}s + " + + ("0 planner calls (frozen replay)." if reused else "1 planner call.") + ) + if isinstance(result, dict) and isinstance(result.get("insight"), str): + # remembered so a later 'audit that insight' audits THIS, not canned data + ctx.state["authored_workflow:ca:last_insight"] = result["insight"] + if key in ("sequence", "revise") and task.get("question"): + ctx.state["authored_workflow:ca:last_question"] = task["question"] + if key == "sequence" and not used_frozen_sql: + # SQL FREEZING: the drafting LLM was the last nondeterministic step in + # a frozen plan — freeze its dry-run-validated output so replays of + # this exact question are numerically deterministic (and feedback can + # amend it auditably). + checked_step, checked = next( + ( + (k, v) + for k, v in interp.state.items() + if isinstance(v, dict) and v.get("valid") is True and v.get("sql") + ), + (None, None), + ) + if checked: + _freeze_sql( + task["question"], + checked["sql"], + engine=str(checked.get("engine", "bigquery")), + bytes_processed=checked.get("bytes_processed", 0), + plan_hash=spec_hash, + produced_by_step=checked_step, + ) + yield _msg( + "🧊 **SQL frozen** for this question — re-ask it (any session)" + " and the validated SQL replays with the drafting LLM skipped;" + " say `revise: ` to amend it with an audit trail." 
+ ) + yield Event( + output={ + "scenario": key, + "hash": spec_hash, + "reused": reused, + "dispatches": interp.dispatch_count, + "result": result, + } + ) + + +root_agent = Workflow( + name="bq_ca_planner", + edges=[("START", plan_and_run)], +) diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py new file mode 100644 index 00000000000..1a8d6905cab --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/plan_inspector.py @@ -0,0 +1,532 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""RFC #93 evidence page — frozen workflows with frozen middle results. + +Renders the plan store as a self-contained HTML pitch for the RFC: the +model-authored typed plan is the centerpiece; validated INTERMEDIATE step +results (here: the dry-run-checked SQL the drafting loop produced) freeze +onto the step that produced them, so replays skip that step's LLM entirely; +human feedback amends the frozen artifact through validation, with every +revision recorded. SQL is the demonstrated instance — the mechanism is the +RFC's general step-result freezing tier. + + python plan_inspector.py [session-id] [app] [user] + open ca_plan_store/plan_inspector.html + +With a session id, the page opens with that live session's actual flow, +each turn classified by the RFC mechanism that answered it. 
+""" + +from __future__ import annotations + +import html +import json +import os +import re +import sys + +STORE = os.path.join(os.getcwd(), "ca_plan_store") + +CSS = """ +:root { --ink:#1a1c1e; --mut:#5f6368; --line:#dadce0; --blue:#1a73e8; + --green:#188038; --amber:#b06000; --purple:#7627bb; --red:#c5221f; + --ice:#0277bd; --bg:#f8f9fa; --card:#ffffff; } +* { box-sizing: border-box; } +body { font: 14px/1.55 -apple-system, 'Segoe UI', Roboto, sans-serif; + color: var(--ink); background: var(--bg); margin: 0; padding: 32px; } +h1 { font-size: 24px; margin: 0 0 4px; } +h2 { font-size: 18px; margin: 36px 0 10px; } +h3 { font-size: 14px; margin: 18px 0 6px; } +.sub { color: var(--mut); margin-bottom: 18px; } +.pitch { border-left: 4px solid var(--blue); background: #e8f0fe66; + padding: 12px 16px; border-radius: 0 10px 10px 0; margin: 14px 0; } +.claims { display: flex; gap: 12px; flex-wrap: wrap; margin: 18px 0 8px; } +.claim { flex: 1 1 210px; background: var(--card); border: 1px solid var(--line); + border-radius: 10px; padding: 13px 15px; } +.claim b { display: block; margin-bottom: 4px; } +.c1 b { color: var(--blue); } .c2 b { color: var(--purple); } +.c3 b { color: var(--ice); } .c4 b { color: var(--green); } .c5 b { color: var(--amber); } +.card { background: var(--card); border: 1px solid var(--line); border-radius: 12px; + padding: 20px 22px; margin: 14px 0; } +.tag { display: inline-block; font-size: 11px; font-weight: 600; border-radius: 99px; + padding: 1px 9px; margin-left: 8px; vertical-align: 2px; } +.t-audit { background:#e8f0fe; color: var(--blue); } +.t-ver { background:#f3e8fd; color: var(--purple); } +.t-cons { background:#e6f4ea; color: var(--green); } +.t-safe { background:#fef7e0; color: var(--amber); } +.t-ice { background:#e1f5fe; color: var(--ice); } +.kv { margin: 6px 0; padding: 8px 10px; border-left: 3px solid var(--line); + background: var(--bg); border-radius: 0 6px 6px 0; } +.kv code { font: 12px/1.5 ui-monospace, Menlo, monospace; 
word-break: break-all; } +.kv .why { color: var(--mut); font-size: 12.5px; margin-top: 2px; } +.flow { display: flex; align-items: center; gap: 10px; flex-wrap: wrap; margin: 14px 0; } +.node { border: 1.5px solid var(--blue); border-radius: 8px; padding: 7px 12px; + background: #e8f0fe; font: 12px ui-monospace, Menlo, monospace; } +.node small { display: block; color: var(--mut); font-size: 10.5px; } +.loopbox { border: 1.5px dashed var(--purple); border-radius: 10px; + padding: 14px 10px 10px; display: flex; gap: 10px; + align-items: center; position: relative; } +.loopbox .lbl { color: var(--purple); font-size: 11px; font-weight: 700; } +.loopbox.iced { border-color: var(--ice); background: #e1f5fe33; } +.loopbox.iced::after { content: "❄️ result frozen — SKIPPED on replay"; + position: absolute; top: -11px; right: 10px; font-size: 10px; + background: var(--ice); color: #fff; border-radius: 99px; + padding: 1px 8px; } +.fanbox { border: 1.5px dashed var(--green); border-radius: 10px; padding: 10px; } +.fanbox .lbl { color: var(--green); font-size: 11px; font-weight: 700; } +.arrow { color: var(--mut); font-size: 18px; } +pre { background: #202124; color: #e8eaed; border-radius: 10px; padding: 14px; + overflow: auto; font: 11.5px/1.5 ui-monospace, Menlo, monospace; max-height: 300px; } +details summary { cursor: pointer; color: var(--blue); font-weight: 600; margin: 8px 0; } +.turn { display: flex; gap: 14px; margin: 10px 0; } +.turn .num { flex: 0 0 30px; height: 30px; border-radius: 50%; background: var(--blue); + color: #fff; font-weight: 700; display: flex; align-items: center; + justify-content: center; } +.turn .body { flex: 1; background: var(--card); border: 1px solid var(--line); + border-radius: 10px; padding: 10px 14px; } +.turn .ask { font-weight: 600; } +.turn .mech { font-size: 12px; margin: 4px 0; } +.turn .insight { color: var(--mut); font-size: 12.5px; border-left: 3px solid var(--line); + padding-left: 8px; margin-top: 6px; } +.midresult { 
border: 1.5px solid var(--ice); border-radius: 10px; + background: #e1f5fe44; padding: 12px 14px; margin: 10px 0; } +.midresult .q { font-weight: 600; } +.rev { border-left: 3px solid var(--blue); background: var(--bg); padding: 8px 10px; + border-radius: 0 6px 6px 0; margin: 6px 0; font-size: 12.5px; } +""" + +_SQL_PRODUCER_HINTS = ("draft", "sqlgen", "sql", "loop") + + +def _node(step, frozen_step_ids=()) -> str: + kind = step.get("kind") + if kind == "step": + binding = step.get("input", {}) + src = ( + "task input" + if binding.get("source") == "task" + else f"← {binding.get('step')}" + ) + return ( + f'
{html.escape(step["id"])}' + f"{html.escape(step['capability'])} ·" + f" {html.escape(src)}
" + ) + if kind == "fan_out": + over = step.get("over", {}) + src = over.get("path") or over.get("step") or "task" + inner = ( + f'
{html.escape(step["id"])}' + f"{html.escape(step['capability'])} × each of" + f" {html.escape(str(src))}
" + ) + return ( + '
FAN-OUT (parallel,' + f" isolated){inner}
" + ) + if kind == "pipeline": + stages = " ".join( + f'
{html.escape(s["capability"])}
' + for s in step.get("stages", []) + ) + return ( + f'
PIPELINE (per item,' + f" barrier-free)
{stages}
" + ) + if kind == "loop_until": + body = " ".join( + _node(s, frozen_step_ids) for s in step.get("body", []) + ) + iced = " iced" if step.get("id") in frozen_step_ids else "" + return ( + f'
LOOP until' + f" {html.escape(step.get('until_capability', '?'))} (max" + f" {step.get('max_iters')}){body}
" + ) + if kind == "branch": + return f'
branch: {html.escape(step["id"])}
' + return f'
{html.escape(str(kind))}
' + + +def _flow(spec, frozen_step_ids=()) -> str: + steps = " ".join( + _node(s, frozen_step_ids) for s in spec.get("steps", []) + ) + return f'
{steps}
' + + +def _kv(label, value, why, tag, tag_label) -> str: + return ( + f'
{html.escape(label)}' + f'{tag_label}
' + f"{html.escape(value)}" + f'
{html.escape(why)}
' + ) + + +def _sql_producing_step_ids(spec, mid_results=()) -> list: + """The plan steps whose validated results froze. Prefer the lineage the + artifacts RECORDED (produced_by_step); fall back to the drafting-loop + heuristic for records frozen before lineage existed.""" + recorded = { + r.get("produced_by_step") + for r in mid_results + if r.get("produced_by_step") + } + step_ids = {s.get("id") for s in spec.get("steps", [])} + # lineage may point INSIDE a loop body — surface the loop in that case + for s in spec.get("steps", []): + if s.get("kind") == "loop_until": + body_ids = {b.get("id") for b in s.get("body", [])} + if recorded & body_ids: + recorded.add(s.get("id")) + ids = [i for i in recorded if i in step_ids] + if ids: + return ids + return [ + s.get("id") + for s in spec.get("steps", []) + if s.get("kind") == "loop_until" + and any(h in str(s.get("id", "")).lower() for h in _SQL_PRODUCER_HINTS) + ] + + +def _mid_result(rec: dict) -> str: + """A frozen middle result, attached to the plan that produced it.""" + revs = rec.get("revisions", []) + rev_html = "".join( + f'
revision #{i + 1} — human feedback:' + f" {html.escape(r.get('feedback', ''))}" + f"
previous artifact (preserved)" + f"
{html.escape(r.get('previous_sql') or '')}
" + for i, r in enumerate(revs) + ) + return ( + '
❄️ Frozen middle result —' + f" question: “{html.escape(rec.get('question', ''))}”
" + f"
artifact hash" + f" {rec.get('sql_hash', '')[:16]} · validated" + f" {str(rec.get('validated_at', ''))[:19]} ·" + f" engine {html.escape(str(rec.get('engine', '')))} ·" + f" {len(revs)} human revision(s)" + + ( + " · lineage: plan" + f" {html.escape(str(rec['plan_hash']))}, step" + f" {html.escape(str(rec.get('produced_by_step', '?')))}" + if rec.get("plan_hash") + else "" + ) + + "
" + "
the validated artifact (SQL, in this instance)" + f"
{html.escape(rec.get('sql', ''))}
" + + (rev_html or "") + + "
" + ) + + +def _plan_card(name: str, env: dict, mid_results=()) -> str: + spec = env.get("spec", {}) + frozen_ids = _sql_producing_step_ids(spec, mid_results) if mid_results else [] + parts = [ + f'

Frozen workflow:' + f" {html.escape(name)} — “{html.escape(spec.get('goal', ''))}”

", + ( + "Authored by the model ONCE, as typed data — every box a" + " pre-approved capability, every arrow a typed binding the" + " validator checked. The plan replays across sessions with zero" + " planner calls:" + ), + _flow(spec, frozen_ids), + ] + if mid_results: + parts.append( + "

❄️ Frozen middle results of this workflow

" + "
The RFC's step-result" + " freezing tier: the ❄️ step's validated output is frozen WITH the" + " plan. On replay the step's LLM is skipped — the run is" + " numerically deterministic — and the artifact re-validates on" + " load (drift detection). Human feedback amends it THROUGH" + " validation, every revision recorded:
" + ) + parts.extend(_mid_result(r) for r in mid_results) + parts += [ + _kv( + "spec_hash", + env.get("spec_hash", ""), + "Tamper evidence: every import recomputes sha256 over the spec" + " and rejects on mismatch.", + "t-audit", + "AUDITABLE", + ), + _kv( + "planner_model · created_at", + f"{env.get('planner_model')} · {env.get('created_at')}", + "Authoring provenance: which model wrote this orchestration and" + " when.", + "t-audit", + "AUDITABLE", + ), + _kv( + "registry + capability versions · contract hashes", + f"registry v{env.get('registry_version')} · " + + json.dumps(env.get("capability_versions", {})) + + " · " + + json.dumps({ + k: v[:10] + for k, v in (env.get("capability_contract_hashes") or {}).items() + }), + "Drift detection: a capability whose contract changed since" + " freezing makes the plan refuse to load — loudly.", + "t-ver", + "VERSIONED", + ), + _kv( + "task_input_schema · task_input_digest", + f"{json.dumps(env.get('task_input_schema'))} · " + + str(env.get("task_input_digest", ""))[:16], + "Template reuse: a new session validates ITS question against" + " the captured schema and runs the same governed pipeline.", + "t-cons", + "CONSISTENT", + ), + ( + "
full frozen record (envelope JSON)" + f"
{html.escape(json.dumps(env, indent=2))}
" + ), + "
", + ] + return "\n".join(parts) + + +def _fetch_session(app: str, user: str, session_id: str, port: int = 8001): + import urllib.request + + url = f"http://127.0.0.1:{port}/apps/{app}/users/{user}/sessions/{session_id}" + try: + with urllib.request.urlopen(url, timeout=5) as r: + return json.loads(r.read()) + except Exception: + return None + + +def _session_questions(session: dict) -> set: + """Lower-cased user-turn texts plus revise-target questions — used to + scope middle results to THIS session's actual flow.""" + qs = set() + for e in session.get("events", []): + content = e.get("content") or {} + texts = [ + p.get("text", "") for p in content.get("parts") or [] if p.get("text") + ] + blob = " ".join(texts) + if e.get("author") == "user" and blob.strip(): + qs.add(re.sub(r"\s+", " ", blob.strip().lower())) + m = re.search(r'Revising the frozen SQL\*?\*? for: \\?"([^"]+)"', blob) + if m: + qs.add(re.sub(r"\s+", " ", m.group(1).strip().lower())) + return qs + + +def _session_timeline(session: dict) -> str: + turns, cur = [], None + for e in session.get("events", []): + content = e.get("content") or {} + texts = [ + p.get("text", "") for p in content.get("parts") or [] if p.get("text") + ] + blob = " ".join(texts) + if e.get("author") == "user" and blob.strip(): + cur = {"ask": blob.strip(), "beats": []} + turns.append(cur) + elif cur is not None and blob: + cur["beats"].append(blob) + + cards = [] + for i, t in enumerate(turns, 1): + beats = " ".join(t["beats"]) + if "Frozen SQL replay" in beats: + mech = ( + '❄️ STEP-RESULT REPLAY the' + " workflow ran with its drafting step SKIPPED — the frozen" + " middle result reused; numbers deterministic" + ) + elif "Revising the frozen SQL" in beats: + mech = ( + '🛠 HUMAN-GOVERNED REVISION' + " feedback applied to the frozen middle result THROUGH a real" + " dry-run, recorded in the artifact, then executed" + ) + elif "Authored plan" in beats: + mech = ( + '📝 MODEL AUTHORED THE WORKFLOW' + " once — typed plan, 
validated, frozen (1 planner call)" + ) + elif "Reusing frozen plan" in beats: + mech = ( + '♻️ FROZEN-WORKFLOW REPLAY 0' + " planner calls — new data through the same governed pipeline" + ) + elif "Conversational turn" in beats: + mech = ( + '💬 CONVERSATION intent gate —' + " no workflow issued" + ) + else: + mech = 'WORKFLOW' + hash_m = re.search(r"validated SQL \(hash `([0-9a-f]+)`", beats) + rev_m = re.search(r"(\d+) human revision", beats) + extra = "" + if hash_m: + extra += f" · artifact `{hash_m.group(1)}`" + if rev_m: + extra += f" · {rev_m.group(1)} revision(s) applied" + ins_m = re.search(r'"insight": "([^"]+)', beats) + insight = ( + f'
{html.escape(ins_m.group(1)[:220])}
' + if ins_m + else "" + ) + cards.append( + f'
{i}
' + f'
“{html.escape(t["ask"][:160])}”
' + f'
{mech}{html.escape(extra)}
{insight}' + "
" + ) + if not cards: + return "" + return ( + '

▶️ The mechanism, live —' + " this session as it actually ran

Read straight" + " from the running ADK session. Watch the arc: the workflow answers" + " → a human amends its frozen middle result → the SAME workflow" + " replays carrying the revision.
" + + "".join(cards) + + "
" + ) + + +def main() -> str: + envs = {} + for fn in sorted(os.listdir(STORE)): + if fn.endswith(".json"): + with open(os.path.join(STORE, fn)) as f: + envs[fn[:-5]] = json.load(f) + if not envs: + print("plan store is empty — run a demo session first", file=sys.stderr) + raise SystemExit(1) + sql_dir = os.path.join(STORE, "sql") + mid_results = [] + if os.path.isdir(sql_dir): + for fn in sorted(os.listdir(sql_dir)): + if fn.endswith(".json"): + with open(os.path.join(sql_dir, fn)) as f: + mid_results.append(json.load(f)) + + timeline, session_qs = "", None + if len(sys.argv) > 1: + sid = sys.argv[1] + app = sys.argv[2] if len(sys.argv) > 2 else "bq_ca_planner" + user = sys.argv[3] if len(sys.argv) > 3 else "user" + session = _fetch_session(app, user, sid) + if session: + timeline = _session_timeline(session) + session_qs = _session_questions(session) + + # Middle results attach to the workflow that produced them (sequence). + # With a session given, scope them to THAT session's flow; artifacts from + # other sessions collapse into a separate card (the store is global by + # design — that's the cross-session point — but the page should mirror + # the demo run on screen). + if session_qs is not None: + in_session = [ + r + for r in mid_results + if re.sub(r"\s+", " ", r.get("question", "").strip().lower()) + in session_qs + ] + other = [r for r in mid_results if r not in in_session] + else: + in_session, other = mid_results, [] + + cards = timeline + for name, env in envs.items(): + short = str(env.get("spec_hash", ""))[:12] + attach = [ + r + for r in in_session + if r.get("plan_hash") == short + or (not r.get("plan_hash") and name == "sequence") + ] + cards += _plan_card(name, env, attach) + if other: + cards += ( + '
❄️ Frozen middle results from' + f" OTHER sessions ({len(other)}) — the store is cross-session by" + " design" + + "".join(_mid_result(r) for r in other) + + "
" + ) + + page = f""" +RFC #93 — Frozen Workflows with Frozen Middle Results + +

RFC #93: Reproducible Model-Authored Workflows

+
Demonstrated live on BigQuery Conversational Analytics over +bigquery-public-data.thelook_ecommerce — every artifact on this page is real, +read from the running demo's plan store.
+ +
The thesis: a model should author orchestration once, as +typed data — then the workflow, and the validated middle results its steps produce, +freeze into durable artifacts. Replays skip the nondeterministic steps entirely; humans +amend the artifacts through validation, never by re-prompting; and everything is +auditable, versioned, and drift-checked. A chat agent gives you answers. This gives you a +governed analytics asset.
+ +
+
📝 Authored onceThe model emits a typed plan over a closed +capability vocabulary — no code, no sandbox. Validated, lint-checked, frozen, exported.
+
🏷️ Versioned & drift-checkedRegistry, capability versions, +and derived contract hashes seal in. Changed semantics → the plan refuses to load.
+
❄️ Middle results freeze tooThe step that drafts SQL is the last +nondeterministic step — so its dry-run-validated output freezes WITH the plan. Replays skip +it: same numbers, to the cent, across sessions.
+
🛠 Human-governedFeedback amends a frozen middle result through +real validation; the feedback and the previous artifact are preserved in the record — a +reviewed change, not a re-roll.
+
🔍 Auditable end to endWho authored the plan, what it runs, +which artifact answered, who revised it and why — all readable, diffable data.
+
+ +{cards} + +

Why this matters beyond SQL

+What froze here is a SQL statement — but the mechanism is general: any step's validated +output can freeze the same way. A retrieved schema, a verified claim set, a chart +specification, an extraction template — each one a middle result that today is re-rolled by +an LLM on every run. The RFC's freezing tiers turn them into governed artifacts: +v1 the frozen plan (process determinism) · v1.1 the exported envelope +(portability + audit) · v1.2 frozen step results (numeric determinism + human +governance) · v2 templates (approved reuse against new inputs).
+""" + + out = os.path.join(STORE, "plan_inspector.html") + with open(out, "w") as f: + f.write(page) + print(out) + return out + + +if __name__ == "__main__": + main() diff --git a/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py new file mode 100644 index 00000000000..27d79ccfb99 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_ca_demo/test_ca_demo_agent.py @@ -0,0 +1,1434 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CI-safe tests for the BQ Conversational Analytics workflow demo (no LLM). + +Each demo scenario's expected workflow shape is built by hand, validated + +lint-checked against the demo registry, and EXECUTED end-to-end on the real +ADK engine with the language capabilities (nl2sql, summaries, classifier, +skeptic) swapped for deterministic stubs — so all seven coordination shapes +the demo authors on camera are pinned in CI. 
def _stub(name, fn):
  """Return a zero-argument factory that builds a deterministic stub node.

  The factory is what `_stub_registry` passes as a capability's `build`
  argument. Each call to it creates a node registered under `name` that
  emits a single event whose output is `fn(node_input)`.

  Args:
    name: Name given to the created node.
    fn: Callable applied to the node input to produce the event output.

  Returns:
    The `build` factory.
  """
  def build():
    @node(name=name)
    async def n(ctx, node_input):
      # `ctx` is unused: the stub's output depends only on its input.
      yield Event(output=fn(node_input))

    return n

  return build
build=_stub( + "nl2sql", + lambda s: { + "sql": ( + "SELECT region, SUM(sale_price) AS revenue FROM" + " order_items GROUP BY region" + ) + }, + ), + ), + Capability( + name="draft_or_repair_sql", + input_kind="item", + serialize_input=False, + build=_stub( + "draft_or_repair_sql", + lambda s: {"sql": "SELECT status FROM orders LIMIT 10"}, + ), + ), + Capability( + name="summarize_insight", + input_kind="item", + serialize_input=False, + build=_stub( + "summarize_insight", + lambda s: {"insight": "US-West leads revenue."}, + ), + ), + Capability( + name="classify_question", + input_kind="item", + serialize_input=False, + build=_stub( + "classify_question", + lambda s: { + "category": ( + "schema" if "mean" in json.dumps(s).lower() else "data" + ) + }, + ), + ), + Capability( + name="skeptic", + input_kind="item", + serialize_input=False, + build=_stub( + "skeptic", + lambda v: {"insight": str(v), "refuted": "1,000,000" in str(v)}, + ), + ), + Capability( + name="describe_schema", + input_kind="item", + serialize_input=False, + build=_stub( + "describe_schema", + lambda q: { + "answer": ( + "orders.status values include Complete, Shipped," + " Processing, Cancelled, Returned." 
+ ) + }, + ), + ), + ] + passthrough = [ + cap for name, cap in real._by_name.items() if name not in _LLM_CAPS + ] + return CapabilityRegistry(stubs + passthrough) + + +# ----------------------------------------------------- expected shapes +def _expected_spec(key: str) -> WorkflowSpec: + """The shape each scenario's planner recipe asks for, built by hand.""" + if key == "sequence": + return WorkflowSpec( + goal="revenue by region", + steps=[ + LoopUntil( + kind="loop_until", + id="sqlgen", + init=Binding(source="task"), + body=[ + StepRef( + kind="step", + id="draft", + capability="draft_or_repair_sql", + input=Binding(source="step", step="sqlgen"), + ), + StepRef( + kind="step", + id="check", + capability="dry_run", + input=Binding(source="step", step="draft"), + ), + ], + until_capability="sql_ok", + until_input=Binding(source="step", step="check"), + max_iters=3, + ), + StepRef( + kind="step", + id="rows", + capability="run_query", + input=Binding(source="step", step="sqlgen"), + ), + StepRef( + kind="step", + id="chart", + capability="render_chart", + input=Binding(source="step", step="rows"), + ), + StepRef( + kind="step", + id="sum", + capability="summarize_insight", + input=Binding(source="step", step="rows"), + ), + ], + output=Binding(source="step", step="sum"), + ) + if key == "fanout": + return WorkflowSpec( + goal="profile data quality", + steps=[ + FanOut( + kind="fan_out", + id="profiles", + over=Binding(source="task", path="tables"), + capability="profile_table", + ), + StepRef( + kind="step", + id="report", + capability="quality_report", + input=Binding(source="step", step="profiles"), + ), + ], + output=Binding(source="step", step="report"), + ) + if key == "pipeline": + return WorkflowSpec( + goal="dashboard", + steps=[ + Pipeline( + kind="pipeline", + id="panels", + over=Binding(source="task", path="questions"), + stages=[ + PipelineStage(capability="draft_or_repair_sql"), + PipelineStage(capability="dry_run"), + 
PipelineStage(capability="run_query"), + PipelineStage(capability="render_chart"), + ], + ), + ], + output=Binding(source="step", step="panels"), + ) + if key == "branch": + return WorkflowSpec( + goal="route the question", + steps=[ + StepRef( + kind="step", + id="cls", + capability="classify_question", + input=Binding(source="task"), + ), + Branch( + kind="branch", + id="route", + on=Binding(source="step", step="cls", path="category"), + routes=[ + Route( + value="data", + block=[ + StepRef( + kind="step", + id="d_sql", + capability="nl2sql", + input=Binding(source="task"), + ), + StepRef( + kind="step", + id="d_check", + capability="dry_run", + input=Binding(source="step", step="d_sql"), + ), + StepRef( + kind="step", + id="d_rows", + capability="run_query", + input=Binding(source="step", step="d_check"), + ), + StepRef( + kind="step", + id="d_sum", + capability="summarize_insight", + input=Binding(source="step", step="d_rows"), + ), + ], + ), + Route( + value="schema", + block=[ + StepRef( + kind="step", + id="s_desc", + capability="describe_schema", + input=Binding(source="task"), + ) + ], + ), + ], + ), + ], + output=Binding(source="step", step="route"), + ) + if key == "loop": + return WorkflowSpec( + goal="sql self-repair from a real broken query", + steps=[ + LoopUntil( + kind="loop_until", + id="repair", + init=Binding(source="task"), + body=[ + StepRef( + kind="step", + id="check", + capability="dry_run", + input=Binding(source="step", step="repair"), + ), + StepRef( + kind="step", + id="fix", + capability="draft_or_repair_sql", + input=Binding(source="step", step="check"), + ), + ], + until_capability="sql_ok", + until_input=Binding(source="step", step="check"), + max_iters=3, + ), + StepRef( + kind="step", + id="rows", + capability="run_query", + input=Binding(source="step", step="repair"), + ), + ], + output=Binding(source="step", step="rows"), + ) + if key == "adversarial": + return WorkflowSpec( + goal="audit insights", + steps=[ + FanOut( + 
kind="fan_out", + id="verdicts", + over=Binding(source="task", path="insights"), + capability="skeptic", + ), + StepRef( + kind="step", + id="kept", + capability="keep_verified", + input=Binding(source="step", step="verdicts"), + ), + ], + output=Binding(source="step", step="kept"), + ) + if key == "tournament": + return WorkflowSpec( + goal="best chart", + steps=[ + LoopUntil( + kind="loop_until", + id="bracket", + init=Binding(source="task", path="chart_options"), + body=[ + StepRef( + kind="step", + id="pairs", + capability="pair_charts", + input=Binding(source="step", step="bracket"), + ), + FanOut( + kind="fan_out", + id="winners", + over=Binding(source="step", step="pairs"), + capability="judge_chart", + ), + ], + until_capability="single_chart", + until_input=Binding(source="step", step="winners"), + max_iters=3, + ), + StepRef( + kind="step", + id="viz", + capability="render_chart", + input=Binding(source="step", step="bracket"), + ), + ], + output=Binding(source="step", step="viz"), + ) + raise KeyError(key) + + +async def _run(spec, registry, task_input): + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + holder["out"] = await SpecInterpreter(registry, ctx).execute( + spec, task_input + ) + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name=wf.name, node=wf, session_service=ss) + s = await ss.create_session(app_name=wf.name, user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + return holder["out"] + + +# ----------------------------------------------------- tests +def test_stubs_tolerate_authored_binding_shapes(monkeypatch): + # The plan is MODEL-authored: a binding may hand a stub the whole step + # output (dict), a dotted path into it (raw string), or a JSON-encoded + # payload. 
The live error this pins: nl2sql -> dry_run with path='sql' + # passed a raw SQL string and the stub assumed a dict. + monkeypatch.setitem(demo._BQ, "disabled", True) + raw_sql = "SELECT region FROM order_items" + assert demo._sql_of({"sql": raw_sql}) == raw_sql + assert demo._sql_of(json.dumps({"sql": raw_sql})) == raw_sql + assert demo._sql_of(raw_sql) == raw_sql + assert demo._field_of({"valid": True}, "valid") is True + assert demo._field_of(json.dumps({"valid": True}), "valid") is True + assert demo._verdict_of(json.dumps({"insight": "x", "refuted": True})) == { + "insight": "x", + "refuted": True, + "reason": "", + } + assert demo._verdict_of("just text")["refuted"] is False + # dry-run (mock branch) tolerates a raw SQL string input too + out = demo._bq_dry_run(raw_sql) + assert out["valid"] is True and out["sql"] == raw_sql + + +def test_render_chart_accepts_authored_binding_shapes(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + # query output (dict with rows) -> bar over those rows + region_rows = demo._query_engine( + "SELECT region, SUM(x) AS revenue ... 
GROUP BY region INTERVAL 1 YEAR" + ) + ch = demo._render_chart({"rows": region_rows}) + assert ch["chart_type"] == "bar" + assert "US-West" in ch["ascii"] + assert ch["vega_lite"]["data"]["values"] == region_rows + # tournament winner (list with one chart type) -> that mark, canned rows + ch = demo._render_chart(["pie"]) + assert ch["chart_type"] == "pie" + assert ch["vega_lite"]["mark"] == "arc" + # bare chart-type string and raw rows list + assert demo._render_chart("scatter")["vega_lite"]["mark"] == "point" + ch = demo._render_chart(demo._CANNED_ROWS) + assert "US-West" in ch["ascii"] + # ascii preview: one bar line per region, longest bar for the leader + lines = demo._render_chart({"rows": demo._CANNED_ROWS})["ascii"].splitlines() + assert len(lines) == 4 and lines[0].count("█") > lines[-1].count("█") + + +def test_render_chart_derives_encoding_fields(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + ch = demo._render_chart({"rows": [{"category": "A", "count": 3}]}) + assert ch["x_field"] == "category" and ch["y_field"] == "count" + enc = ch["vega_lite"]["encoding"] + assert enc["x"]["field"] == "category" and enc["y"]["field"] == "count" + + +def test_chart_png_renders_or_falls_back(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + ch = demo._render_chart({"rows": demo._CANNED_ROWS}) + png = demo._chart_png(ch) + if png is None: + pytest.skip("matplotlib not installed — text fallback path") + assert png[:8] == b"\x89PNG\r\n\x1a\n" # real PNG bytes + assert len(png) > 5000 + # every chart kind renders without error + for kind in ("pie", "line", "scatter"): + assert demo._chart_png(demo._render_chart([kind])) is not None + + +def test_qualify_sql_for_real_bigquery(): + q = demo._qualify_sql( + "SELECT * FROM thelook_ecommerce.orders JOIN" + " thelook_ecommerce.order_items USING (order_id)" + ) + assert "`bigquery-public-data.thelook_ecommerce.orders`" in q + assert "`bigquery-public-data.thelook_ecommerce.order_items`" in q + # 
already-qualified and backticked inputs normalize to the same form + same = demo._qualify_sql( + "SELECT * FROM `bigquery-public-data.thelook_ecommerce.orders`" + ) + assert same.count("`bigquery-public-data.thelook_ecommerce.orders`") == 1 + + +def test_jsonify_cells(): + import datetime + import decimal + + assert demo._jsonify_cell(decimal.Decimal("3.14159")) == 3.14 + assert demo._jsonify_cell(2.71828) == 2.72 + assert demo._jsonify_cell(datetime.date(2024, 1, 31)) == "2024-01-31" + assert demo._jsonify_cell(datetime.datetime(2024, 1, 31, 12, 0)).startswith( + "2024-01-31T12:00" + ) + assert demo._jsonify_cell("x") == "x" and demo._jsonify_cell(7) == 7 + + +def test_dry_run_and_execute_fall_back_without_bigquery(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + d = demo._bq_dry_run({"sql": "SELECT region FROM thelook_ecommerce.orders"}) + assert d["engine"] == "mock" and d["valid"] is True + out = demo._execute_sql( + {"sql": "SELECT region, SUM(x) AS revenue ... GROUP BY region"} + ) + assert out["engine"] == "mock" + assert [r["region"] for r in out["rows"]][0] == "US-West" + + +def test_failing_query_returns_error_not_fabricated_rows(monkeypatch): + class _Boom: + + def query(self, *a, **k): + raise RuntimeError("400 invalid query") + + monkeypatch.setitem(demo._BQ, "disabled", False) + monkeypatch.setitem(demo._BQ, "error", None) + monkeypatch.setitem(demo._BQ, "client", _Boom()) + out = demo._execute_sql({"sql": "SELECT broken"}) + assert out["engine"] == "bigquery" + assert out["rows"] == [] and "400" in out["error"] # honest failure + + +def _freeze_record(key): + return demo.FrozenWorkflowRecord.freeze( + _expected_spec(key), + planner_model="gemini-3.5-flash", + registry=demo._registry(), + created_at="2026-06-10T00:00:00Z", + task_input=demo.SCENARIOS[key]["task"], + task_input_schema={"required": sorted(demo.SCENARIOS[key]["task"])}, + ) + + +def test_cross_session_store_roundtrip_and_template_reuse( + tmp_path, monkeypatch +): + # 
Session A freezes + exports; "session B" (no session state) imports the + # plan through the defensive path — including with a NEW question, which + # is template reuse validated against the captured task_input_schema. + monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) + demo._store_plan("sequence", _freeze_record("sequence")) + # same canned input -> replay path + spec, reject = demo._load_stored_plan( + "sequence", demo._registry(), demo.SCENARIOS["sequence"]["task"] + ) + assert reject is None and spec is not None + # NEW question -> template path (schema validates the input) + spec, reject = demo._load_stored_plan( + "sequence", + demo._registry(), + {"question": "revenue by category last year?"}, + ) + assert reject is None and spec is not None + assert spec.model_dump() == _expected_spec("sequence").model_dump() + # nothing stored for another key + assert demo._load_stored_plan( + "fanout", demo._registry(), demo.SCENARIOS["fanout"]["task"] + ) == (None, None) + + +def test_cross_session_import_rejects_tamper_and_drift(tmp_path, monkeypatch): + monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) + path = demo._store_plan("fanout", _freeze_record("fanout")) + # tampered spec -> hash mismatch, rejected with a reason + env = json.load(open(path)) + env["spec"]["goal"] = "exfiltrate" + json.dump(env, open(path, "w")) + spec, reject = demo._load_stored_plan( + "fanout", demo._registry(), demo.SCENARIOS["fanout"]["task"] + ) + assert spec is None and "spec_hash mismatch" in reject + # contract drift: same plan, but a capability's schema changed since + demo._store_plan("fanout", _freeze_record("fanout")) + + from pydantic import BaseModel + + class NewReport(BaseModel): + n: int + + drifted = demo._registry() + drifted["profile_table"].output_model = NewReport # version not bumped + spec, reject = demo._load_stored_plan( + "fanout", drifted, demo.SCENARIOS["fanout"]["task"] + ) + assert spec is None and "contract drift" in reject + + +def 
test_dry_run_preserves_question_for_repair_rounds(monkeypatch): + # Review finding: after a FAILED dry run, the loop-carried value must + # still hold the user's question — otherwise the repair round repairs + # from sql+error with no goal context. Mock branch: + monkeypatch.setitem(demo._BQ, "disabled", True) + out = demo._bq_dry_run({"sql": "SELECT 1", "question": "trend by year?"}) + assert out["question"] == "trend by year?" + + # Real-branch FAILURE (the path that feeds the repair round): + class _Boom: + + def query(self, *a, **k): + raise RuntimeError("400 TIMESTAMP_SUB does not support YEAR") + + monkeypatch.setitem(demo._BQ, "disabled", False) + monkeypatch.setitem(demo._BQ, "error", None) + monkeypatch.setitem(demo._BQ, "client", _Boom()) + out = demo._bq_dry_run({"sql": "SELECT broken", "question": "trend?"}) + assert out["valid"] is False and "TIMESTAMP_SUB" in out["error"] + assert out["question"] == "trend?" # full repair context preserved + # and the Sql schema itself carries the echo field: + assert "question" in demo.Sql.model_fields + + +def test_chart_multiseries_per_region_per_year(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + # The shape the user's real question produces: GROUP BY region, year with + # two measures. x = the time field, one SERIES per region, measure picked + # by name preference (total_sales over total_orders); int year never + # mistaken for the measure. 
+ rows = [ + {"region": r, "year": y, "total_sales": s, "total_orders": o} + for (r, y, s, o) in [ + ("US-West", 2024, 100.0, 10), + ("US-West", 2025, 130.0, 12), + ("EMEA", 2024, 70.0, 8), + ("EMEA", 2025, 90.0, 9), + ] + ] + ch = demo._render_chart({"rows": rows}) + assert ch["x_field"] == "year" + assert ch["series_field"] == "region" + assert ch["y_field"] == "total_sales" + assert ch["chart_type"] == "line" + assert ch["vega_lite"]["encoding"]["color"]["field"] == "region" + assert "US-West" in ch["ascii"] and "130.00" in ch["ascii"] + png = demo._chart_png(ch) + if png is not None: + assert png[:8] == b"\x89PNG\r\n\x1a\n" + + +@pytest.mark.skipif( + not os.environ.get("CA_DEMO_LIVE_BQ"), + reason="live BigQuery round-trip (set CA_DEMO_LIVE_BQ=1 + credentials)", +) +def test_live_bigquery_roundtrip(): + good = demo._bq_dry_run({ + "sql": ( + "SELECT status, COUNT(*) AS n FROM thelook_ecommerce.orders" + " GROUP BY status" + ) + }) + assert good["engine"] == "bigquery" and good["valid"] is True + assert good["bytes_processed"] > 0 + bad = demo._bq_dry_run({"sql": "SELECT nope FROM thelook_ecommerce.orders"}) + assert bad["valid"] is False and bad["error"] # a REAL BigQuery error + out = demo._execute_sql({ + "sql": ( + "SELECT status, COUNT(*) AS n FROM thelook_ecommerce.orders" + " GROUP BY status ORDER BY n DESC LIMIT 3" + ) + }) + assert out["engine"] == "bigquery" and len(out["rows"]) == 3 + assert out["rows"][0]["n"] > 0 + + +def test_engine_aggregates_by_region_and_window(): + # The "intelligent mock": rows are AGGREGATED from synthetic facts per the + # SQL's intent, not pattern-matched to a canned answer. + q = demo._query_engine( + "SELECT country AS region, SUM(p) AS revenue ... GROUP BY region" + " ... INTERVAL 1 QUARTER" + ) + y = demo._query_engine( + "SELECT country AS region, SUM(p) AS revenue ... GROUP BY region" + " ... 
INTERVAL 1 YEAR" + ) + assert [r["region"] for r in q] == ["US-West", "US-East", "EMEA", "APAC"] + # a year window strictly contains the quarter window: + assert ( + all(yr["revenue"] > qr["revenue"] for yr, qr in zip(y, q)) and len(y) == 4 + ) + + +def test_engine_monthly_trend_with_alias_and_country_filter(): + # The exact live gap this replaces: a trend question now returns a real + # monthly series, honoring the SQL's measure alias and US filter. + rows = demo._query_engine( + "SELECT DATE_TRUNC(o.created_at, MONTH) AS month, SUM(oi.sale_price)" + " AS total_sales FROM ... WHERE country = 'United States' AND" + " created_at >= TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 2 YEAR)" + " GROUP BY month ORDER BY month" + ) + assert len(rows) == 24 # 2 years of months + assert list(rows[0]) == ["month", "total_sales"] + assert rows[0]["month"] == "2024-01" and rows[-1]["month"] == "2025-12" + # US-only filter: below the all-regions total for the same window + all_rows = demo._query_engine( + "SELECT month, SUM(x) AS total_sales ... INTERVAL 2 YEAR GROUP BY month" + ) + assert rows[0]["total_sales"] < all_rows[0]["total_sales"] + + +def test_engine_grand_total_and_category_grouping(): + total = demo._query_engine("SELECT SUM(sale_price) ... INTERVAL 2 YEAR") + assert len(total) == 1 and total[0]["revenue"] > 0 + cats = demo._query_engine( + "SELECT category, SUM(x) AS revenue ... GROUP BY category" + ) + assert [r["category"] for r in cats] == [ + "Outerwear", + "Jeans", + "Activewear", + "Accessories", + ] + + +def test_engine_yearly_and_quarterly_grains(): + # The exact live gap: EXTRACT(YEAR ...) AS year GROUP BY year produced a + # single anonymous grand total. Yearly and quarterly grains now bucket the + # monthly facts (the warehouse holds 24 months, so a 3-year window caps + # at 2 years of buckets). + yearly = demo._query_engine( + "SELECT EXTRACT(YEAR FROM t1.created_at) AS year, SUM(t2.sale_price)" + " AS total_sales FROM ... 
WHERE t1.created_at >=" + " TIMESTAMP_SUB(CURRENT_TIMESTAMP(), INTERVAL 3 YEAR) GROUP BY year" + " ORDER BY year" + ) + assert [r["year"] for r in yearly] == ["2024", "2025"] + assert all(r["total_sales"] > 1_000_000 for r in yearly) + quarterly = demo._query_engine( + "SELECT DATE_TRUNC(created_at, QUARTER) AS quarter, SUM(x) AS revenue" + " ... INTERVAL 2 YEAR GROUP BY quarter" + ) + assert [r["quarter"] for r in quarterly] == [ + f"{y}-Q{q}" for y in (2024, 2025) for q in (1, 2, 3, 4) + ] + # buckets are consistent: quarters sum to their year. + assert round(sum(r["revenue"] for r in quarterly[:4]), 2) == round( + yearly[0]["total_sales"], 2 + ) + + +def test_chart_infers_line_for_time_series(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + rows = demo._query_engine( + "SELECT month, SUM(x) AS sales ... GROUP BY month INTERVAL 1 YEAR" + ) + ch = demo._render_chart({"rows": rows}) + assert ch["chart_type"] == "line" # date-shaped x labels -> trend line + assert ch["vega_lite"]["mark"] == "line" + # quarterly and yearly buckets are time series too: + q_rows = [{"quarter": "2024-Q1", "v": 1.0}, {"quarter": "2024-Q2", "v": 2.0}] + assert demo._render_chart({"rows": q_rows})["chart_type"] == "line" + y_rows = [{"year": "2024", "v": 1.0}, {"year": "2025", "v": 2.0}] + assert demo._render_chart({"rows": y_rows})["chart_type"] == "line" + # an explicit winner still wins over the inference: + assert demo._render_chart(["bar"])["chart_type"] == "bar" + # a single point is not a trend: + assert demo._render_chart({"rows": [{"total": 5.0}]})["chart_type"] == "bar" + + +def test_text_of_extracts_user_message(): + assert demo._text_of("plain text") == "plain text" + content = types.Content( + role="user", parts=[types.Part(text="last year please")] + ) + assert demo._text_of(content) == "last year please" + + class Wrapped: + pass + + w = Wrapped() + w.content = content + assert demo._text_of(w) == "last year please" + + +def 
test_fanout_profiles_the_live_table_list(monkeypatch): + # Profiling fans out over whatever REALLY exists in the dataset (live + # __TABLES__, cached); without credentials it falls back to the curated + # catalogue. The live list legitimately includes the empty stray + # 'thelook_ecommerce-table' the console shows. + monkeypatch.setitem(demo._BQ, "disabled", True) + monkeypatch.setattr(demo, "_TABLE_LIST_CACHE", {}) + assert demo._task_for("fanout", "profile data quality") == { + "tables": list(demo.TABLES) + } + # cache short-circuits repeated metadata queries + monkeypatch.setattr(demo, "_TABLE_LIST_CACHE", {"tables": ["a", "b"]}) + assert demo._task_for("fanout", "x") == {"tables": ["a", "b"]} + + +def test_sequence_takes_live_question_others_stay_canned(): + q = "What was revenue by region last year?" + assert demo._task_for("sequence", q) == {"question": q} + # empty/whitespace falls back to the canned question + assert demo._task_for("sequence", " ") == demo.SCENARIOS["sequence"]["task"] + # mode-selector scenarios keep canned/derived inputs (fanout discovers + # the live table list — see test_fanout_profiles_the_live_table_list) + assert demo._task_for("pipeline", q) == demo.SCENARIOS["pipeline"]["task"] + + +def test_root_agent_importable_and_named(): + assert isinstance(demo.root_agent, Workflow) + assert demo.root_agent.name == "bq_ca_planner" + + +def test_registry_clean_and_typed(): + reg = demo._registry() + for name in _LLM_CAPS + ("dry_run", "run_query", "profile_table"): + assert name in reg + assert reg.open_map_warnings() == [] # enumerated fields only + + +def test_audit_takes_live_insights_not_canned(): + # The live failure this pins: 'audit this insight ' must audit X, not + # the canned demo set. (Typo'd filler like 'ingisht' is tolerated.) 
+ claim = ( + "China and the United States lead global sales, with most markets" + " peaking in 2025" + ) + task = demo._task_for("adversarial", f"audit this ingisht {claim}") + assert task == {"insights": [claim]} + # multiple insights split on ';' + task = demo._task_for("adversarial", "verify insights: A is true; B is up") + assert task == {"insights": ["A is true", "B is up"]} + # trigger-only message + a remembered last insight -> audit THAT + task = demo._task_for( + "adversarial", "audit these insights", last_insight=claim + ) + assert task == {"insights": [claim]} + # trigger-only, nothing remembered -> canned demo set (final fallback) + task = demo._task_for("adversarial", "audit these insights") + assert task == demo.SCENARIOS["adversarial"]["task"] + # other scenarios unaffected + assert demo._task_for("pipeline", f"audit {claim}") == ( + demo.SCENARIOS["pipeline"]["task"] + ) + + +@pytest.mark.asyncio +async def test_skeptics_are_runtime_isolated_not_reading_history(): + """Empirical isolation proof (no network): a spy on the model layer + captures each fanned-out skeptic's ACTUAL LLM request. Each request must + contain exactly its own insight — not the sibling's insight, not prior + chat beats, not the user's turn message. 
This is the runtime half of the + independence story; the binding lints are the static half.""" + from google.adk import Agent + from google.adk.models.llm_response import LlmResponse + + captured = [] + + def spy(callback_context=None, llm_request=None, **kw): + captured.append( + " ".join( + p.text + for c in llm_request.contents or [] + for p in c.parts or [] + if p.text + ) + ) + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text=json.dumps( + {"insight": "echo", "refuted": False, "reason": "spy"} + ) + ) + ], + ) + ) + + def skeptic_build(): + return Agent( + name="skeptic", + model="gemini-2.5-flash", # never called — spy short-circuits + output_schema=demo.Verdict, + instruction="Refute or uphold. Output Verdict.", + before_model_callback=spy, + ) + + reg = CapabilityRegistry([ + Capability( + name="skeptic", + input_kind="item", + serialize_input=True, + build=skeptic_build, + ), + demo._registry()._by_name["keep_verified"], + ]) + spec = _expected_spec("adversarial") + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + yield Event( + content=types.Content( + role="model", + parts=[types.Part(text="SECRET-PRIOR-BEAT plan authored")], + ) + ) + holder["out"] = await SpecInterpreter(reg, ctx).execute( + spec, + {"insights": ["INSIGHT-ALPHA is true", "INSIGHT-BETA is false"]}, + ) + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name="t", node=wf, session_service=ss) + s = await ss.create_session(app_name="t", user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content( + parts=[types.Part(text="USER-TURN-MESSAGE audit stuff")], + role="user", + ), + ): + pass + + assert len(captured) == 2 # one REAL dispatch per insight + assert "INSIGHT-ALPHA" in captured[0] and "INSIGHT-BETA" in captured[1] + for request_text in captured: + assert "SECRET-PRIOR-BEAT" 
not in request_text # no chat history + assert "INSIGHT-BETA" not in captured[0] # no sibling leakage + assert "INSIGHT-ALPHA" not in captured[1] + # NOTE: the session's user-turn message MAY appear (ADK includes unscoped + # events by design); the isolation guarantees are: own input present, + # sibling inputs and other beats never. + + +def _wrapper_carries_isolation_scope() -> bool: + import inspect + + from google.adk.workflow import _llm_agent_wrapper as wrapper + + return "agent_ctx.isolation_scope" in inspect.getsource( + wrapper.prepare_llm_agent_context + ) + + +@pytest.mark.skipif( + not _wrapper_carries_isolation_scope(), + reason=( + "installed ADK lacks the isolation_scope carry fix in" + " prepare_llm_agent_context (run with PYTHONPATH=src)" + ), +) +@pytest.mark.asyncio +async def test_tool_loop_siblings_do_not_swap_inputs(): + """Regression for the contamination the data-grounded skeptic exposed: + a fanned-out single_turn agent that makes MULTIPLE model calls (tool + loop) must rebuild its context from ITS OWN input on every call. Before + the fix (supervisor per-dispatch scope + wrapper scope carry), call 2 + rebuilt from the LATEST sibling's input — every skeptic answered the + last claim. 
The spy simulates the tool loop deterministically.""" + from google.adk import Agent + from google.adk.models.llm_response import LlmResponse + + captured = [] + + def make_spy(): + state = {"n": 0} + + def spy(callback_context=None, llm_request=None, **kw): + state["n"] += 1 + texts = " ".join( + p.text + for c in llm_request.contents or [] + for p in c.parts or [] + if p.text + ) + captured.append((id(state), state["n"], texts)) + if state["n"] == 1: + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + function_call=types.FunctionCall( + name="query_thelook", args={"sql": "SELECT 1"} + ) + ) + ], + ) + ) + return LlmResponse( + content=types.Content( + role="model", + parts=[ + types.Part( + text=json.dumps( + {"insight": "z", "refuted": False, "reason": "spy"} + ) + ) + ], + ) + ) + + return spy + + def skeptic_build(): + return Agent( + name="skeptic", + model="gemini-2.5-flash", # never called — spy short-circuits + output_schema=demo.Verdict, + tools=[demo.query_thelook], + instruction="Check the claim. 
Output Verdict.", + before_model_callback=make_spy(), + ) + + reg = CapabilityRegistry([ + Capability( + name="skeptic", + input_kind="item", + serialize_input=True, + build=skeptic_build, + ), + demo._registry()._by_name["keep_verified"], + ]) + spec = _expected_spec("adversarial") + monkeypatch_bq = demo._BQ["disabled"] + demo._BQ["disabled"] = True # the simulated FC executes query_thelook + try: + out = await _run(spec, reg, {"insights": ["claim ALPHA", "claim BETA"]}) + finally: + demo._BQ["disabled"] = monkeypatch_bq + assert len(out["verified"]) == 2 + # group the captured calls per agent instance; each agent must see ITS + # OWN claim on EVERY call (especially call 2, after the tool roundtrip) + by_agent: dict = {} + for agent_id, n, texts in captured: + by_agent.setdefault(agent_id, []).append(texts) + assert len(by_agent) == 2 + claims = [] + for calls in by_agent.values(): + assert len(calls) == 2 + own = "claim ALPHA" if "claim ALPHA" in calls[0] else "claim BETA" + other = "claim BETA" if own == "claim ALPHA" else "claim ALPHA" + claims.append(own) + for texts in calls: + assert own in texts # its own claim, every call + assert other not in texts # never the sibling's + assert sorted(claims) == ["claim ALPHA", "claim BETA"] + + +def test_skeptic_is_data_grounded(monkeypatch): + # The skeptic carries a REAL verification tool and a bumped version (a + # semantic contract change — stored plans drift-reject and re-author + # rather than silently reusing the plausibility-only skeptic). 
+ cap = demo._registry()["skeptic"] + assert cap.version == "2" + agent_obj = cap.build() + tool_names = [ + getattr(t, "__name__", getattr(t, "name", "")) for t in agent_obj.tools + ] + assert "query_thelook" in tool_names + assert agent_obj.output_schema is demo.Verdict # tools + schema together + assert "VERIFY the claim" in agent_obj.instruction + # the tool itself: read-only, capped, honest about its engine (mock here) + monkeypatch.setitem(demo._BQ, "disabled", True) + out = demo.query_thelook( + "SELECT region, SUM(x) AS revenue ... GROUP BY region INTERVAL 1 YEAR" + ) + assert out["engine"] == "mock" and len(out["rows"]) <= 50 + assert out["rows"][0]["region"] == "US-West" + + +def test_skeptic_verdicts_render_with_reasons(): + # The audit beat the user could not see: every verdict in interpreter + # state renders as one line WITH the skeptic's stated reason. + state = { + "sqlgen": {"sql": "..."}, # non-verdict values are ignored + "verdicts": [ + { + "insight": "AOV is $1,000,000.", + "refuted": True, + "reason": "Implausible: dataset AOV is roughly $60-90.", + }, + json.dumps({ + "insight": "Sales peaked in 2025.", + "refuted": False, + "reason": "Consistent with the yearly totals; 2026 is partial.", + }), + ], + } + lines = demo._verdict_lines(state) + assert len(lines) == 2 + assert lines[0].startswith("❌ REFUTED") and "$60-90" in lines[0] + assert lines[1].startswith("✅ upheld") and "2026 is partial" in lines[1] + assert demo._verdict_lines({"x": {"rows": []}}) == [] # no audit -> no beat + # the Verdict schema demands the reasoning field + assert "reason" in demo.Verdict.model_fields + # extraction drops the trailing '?' 
user phrasing drags in + task = demo._task_for("adversarial", "audit this insight sales doubled YoY?") + assert task == {"insights": ["sales doubled YoY"]} + + +def test_sql_freezing_roundtrip_and_revision_history(tmp_path, monkeypatch): + # SQL freezing: the validated SQL for a question is a durable artifact; + # replays skip the drafting LLM (numeric determinism). Human feedback + # revisions append to an auditable history. + monkeypatch.setattr(demo, "_SQL_STORE", str(tmp_path)) + q = " What was REVENUE by region last quarter? " + demo._freeze_sql(q, "SELECT 1", engine="bigquery", bytes_processed=42) + # normalized question text hits the same record + rec = demo._load_frozen_sql("what was revenue by region last quarter?") + assert rec is not None and rec["sql"] == "SELECT 1" + assert rec["revisions"] == [] and rec["bytes_processed"] == 42 + # a feedback revision replaces the SQL and RECORDS the feedback + old sql + demo._freeze_sql( + q, + "SELECT 2", + feedback="use calendar quarters, not trailing 90 days", + previous=rec, + ) + rec2 = demo._load_frozen_sql(q) + assert rec2["sql"] == "SELECT 2" + assert len(rec2["revisions"]) == 1 + assert rec2["revisions"][0]["previous_sql"] == "SELECT 1" + assert "calendar quarters" in rec2["revisions"][0]["feedback"] + # unknown question -> None + assert demo._load_frozen_sql("never asked") is None + # WORKFLOW LINEAGE: the middle result records which plan + step produced + # it, and revisions inherit the lineage (the artifact is structurally + # attached to the frozen workflow, not just stored beside it). 
+ demo._freeze_sql( + "lineage q", + "SELECT 1", + plan_hash="abc123def456", + produced_by_step="sqlgen", + ) + rec3 = demo._load_frozen_sql("lineage q") + assert rec3["plan_hash"] == "abc123def456" + assert rec3["produced_by_step"] == "sqlgen" + demo._freeze_sql("lineage q", "SELECT 2", feedback="tweak it", previous=rec3) + rec4 = demo._load_frozen_sql("lineage q") + assert rec4["plan_hash"] == "abc123def456" # lineage survives revisions + assert len(rec4["revisions"]) == 1 + + +def test_frozen_sql_replay_plan_is_static_and_clean(): + # The replay plan is STATIC (constant hash — no authoring) and contains + # no drafting step: dry_run -> run_query -> chart -> summarize. + spec = demo._frozen_sql_spec() + caps = [s.capability for s in spec.steps] + assert caps == ["dry_run", "run_query", "render_chart", "summarize_insight"] + assert "draft_or_repair_sql" not in caps and "nl2sql" not in caps + warnings = WorkflowSpecValidator(demo._registry()).validate(spec) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + h1 = demo.sha256_hex(spec.model_dump(mode="json")) + h2 = demo.sha256_hex(demo._frozen_sql_spec().model_dump(mode="json")) + assert h1 == h2 # deterministic replay plan + + +@pytest.mark.asyncio +async def test_frozen_sql_replay_skips_the_drafting_llm(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + out_holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + interp = SpecInterpreter(_stub_registry(), ctx) + out_holder["out"] = await interp.execute( + demo._frozen_sql_spec(), + { + "question": "revenue by region?", + "sql": "SELECT region, SUM(x) AS revenue ... 
GROUP BY region", + }, + ) + out_holder["n"] = interp.dispatch_count + yield Event(output={"_done": True}) + + wf = Workflow(name="t", edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name="t", node=wf, session_service=ss) + s = await ss.create_session(app_name="t", user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + assert out_holder["n"] == 4 # check, run, chart, summarize — NO draft + assert out_holder["out"] == {"insight": "US-West leads revenue."} + + +def test_conversational_gate_routing(): + # Triggered messages bypass the gate (mode selectors stay deterministic); + # untriggered messages go through the intent gate first — including the + # live failure this pins: a meta-question must NOT become a data query. + assert demo._matched_scenario("profile data quality please") == "fanout" + assert demo._matched_scenario("pick the best chart for revenue") == ( + "tournament" + ) + assert demo._matched_scenario("What was revenue by region?") is None + assert ( + demo._matched_scenario("tell what kinds of workflow you can issue?") + is None + ) + # the catalogue lists all seven scenarios and is template-safe + cat = demo._describe_workflows() + for sc in demo.SCENARIOS.values(): + assert sc["title"] in cat + import re as _re + + assert not _re.findall(r"\{[A-Za-z_][A-Za-z0-9_]*\}", cat) + assert not _re.findall( + r"\{[A-Za-z_][A-Za-z0-9_]*\}", demo._intent_agent().instruction + ) + # the gate's schema: data -> proceed; meta/chat -> direct reply + assert set(demo.Intent.model_fields) == {"intent", "reply"} + + +@pytest.mark.asyncio +async def test_meta_question_escapes_without_any_workflow( + tmp_path, monkeypatch +): + """The full no-workflow escape path, no LLM: a meta-question returns the + gate's reply BEFORE plan-store import, session replay, authoring, or + execution — the exact live failure mode, locked end-to-end.""" + + 
def stub_gate(): + @node(name="intent_gate") + async def n(ctx, node_input): + yield Event(output={"intent": "meta", "reply": "Seven workflow shapes."}) + + return n + + monkeypatch.setattr(demo, "_intent_agent", stub_gate) + stub_reg = _stub_registry() # build BEFORE patching (it reads _registry) + monkeypatch.setattr(demo, "_registry", lambda: stub_reg) + monkeypatch.setattr(demo, "_PLAN_STORE", str(tmp_path)) # empty store + ss = InMemorySessionService() + session = await ss.create_session(app_name="demo", user_id="u") + runner = Runner(app_name="demo", node=demo.root_agent, session_service=ss) + texts, final = [], None + async for ev in runner.run_async( + user_id="u", + session_id=session.id, + new_message=types.Content( + parts=[types.Part(text="tell what kinds of workflow you can issue?")], + role="user", + ), + ): + if isinstance(ev, Event) and ev.content and ev.content.parts: + texts += [p.text for p in ev.content.parts if p.text] + if isinstance(ev, Event) and isinstance(ev.output, dict): + final = ev.output + joined = "\n".join(texts) + assert "Seven workflow shapes." 
in joined # the gate's reply reached chat + assert final == {"scenario": "conversation", "intent": "meta"} + # and NOTHING workflow-shaped happened: + assert "hash" not in (final or {}) + for marker in ("Authored plan", "Reusing frozen plan", "Validation passed"): + assert marker not in joined + assert not list(tmp_path.iterdir()) # nothing was frozen/exported + + +def test_scenario_routing(): + assert demo._scenario_for("What was revenue by region?") == "sequence" + assert demo._scenario_for("Profile data quality please") == "fanout" + assert demo._scenario_for("Build a dashboard for these") == "pipeline" + assert demo._scenario_for("what does status Complete mean?") == "branch" + assert demo._scenario_for("the dry run is unreliable, retry") == "loop" + assert demo._scenario_for("audit these insights") == "adversarial" + assert demo._scenario_for("pick the best chart") == "tournament" + assert demo._scenario_for("hello") == "sequence" # default + assert demo._scenario_for("revise: use calendar quarters") == "revise" + assert demo._scenario_for("update the sql to exclude returns") == "revise" + # overlapping triggers: specialized intent must beat the generic fallback + # ("revenue by region" is a sequence trigger, but these aren't questions). 
+ assert ( + demo._scenario_for("Pick the best chart for revenue by region.") + == "tournament" + ) + assert ( + demo._scenario_for("give me the best chart for revenue by region") + == "tournament" + ) + assert ( + demo._scenario_for("Profile data quality for revenue by region") + == "fanout" + ) + + +def test_all_seven_shapes_validate_and_lint_clean(): + reg = demo._registry() + for key in demo.SCENARIOS: + if key == "revise": # custom feedback flow, not an authored shape + continue + warnings = WorkflowSpecValidator(reg).validate(_expected_spec(key)) + lints = [w for w in warnings if w.startswith("plan-quality")] + assert lints == [], f"{key}: {lints}" + + +@pytest.mark.asyncio +async def test_sequence_executes(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) # no network in unit tests + out = await _run( + _expected_spec("sequence"), + _stub_registry(), + demo.SCENARIOS["sequence"]["task"], + ) + assert out == {"insight": "US-West leads revenue."} + + +@pytest.mark.asyncio +async def test_fanout_executes_no_llm_needed(monkeypatch): + # All SEVEN real dataset tables, profiled in parallel. The mock engine is + # pinned here (no network in unit tests); with credentials the profiler + # queries the real __TABLES__ metadata and labels engine=bigquery. + monkeypatch.setitem(demo._BQ, "disabled", True) + out = await _run( + _expected_spec("fanout"), + demo._registry(), + demo.SCENARIOS["fanout"]["task"], + ) + profiles = demo._CANNED_PROFILES.values() + assert out == { + "tables": 7, + "total_rows": sum(p["row_count"] for p in profiles), + "largest_table": "events", + "total_size_mb": round(sum(p["size_mb"] for p in profiles), 1), + } + + +@pytest.mark.asyncio +async def test_pipeline_executes_per_question(monkeypatch): + # Every panel is translated, validated, EXECUTED, and charted — the shape + # the CA head-to-head proved out (3 real panels in ~10s, barrier-free). 
+ monkeypatch.setitem(demo._BQ, "disabled", True) + out = await _run( + _expected_spec("pipeline"), + _stub_registry(), + demo.SCENARIOS["pipeline"]["task"], + ) + assert len(out) == 3 + for panel in out: + assert "vega_lite" in panel and panel["chart_type"] in ("bar", "line") + assert panel["vega_lite"]["data"]["values"] # executed rows, per panel + + +@pytest.mark.asyncio +async def test_branch_routes_schema_question(): + out = await _run( + _expected_spec("branch"), + _stub_registry(), + demo.SCENARIOS["branch"]["task"], # "...what does ... mean?" -> schema + ) + assert "Complete" in out["answer"] + + +@pytest.mark.asyncio +async def test_loop_repairs_sql_exactly_once(monkeypatch): + # The demo uses the REAL dry-run on a really-broken query; in CI the + # failure is simulated by a stateful stub (fails the first check only). + monkeypatch.setitem(demo._BQ, "disabled", True) + calls = {"n": 0} + + def checking(s): + calls["n"] += 1 + if calls["n"] == 1: + return { + "question": str(demo._field_of(s, "question", "") or ""), + "sql": demo._sql_of(s), + "valid": False, + "error": "Not found: Table thelook_ecommerce.order", + } + return { + "question": str(demo._field_of(s, "question", "") or ""), + "sql": demo._sql_of(s), + "valid": True, + "error": None, + } + + reg = _stub_registry() + caps = [c for n, c in reg._by_name.items() if n != "dry_run"] + caps.append( + Capability( + name="dry_run", + input_kind="item", + serialize_input=False, + build=_stub("dry_run", checking), + ) + ) + reg = CapabilityRegistry(caps) + out = await _run(_expected_spec("loop"), reg, demo.SCENARIOS["loop"]["task"]) + assert calls["n"] == 2 # exactly one repair round + assert out["engine"] == "mock" and out["rows"] # query ran after repair + + +@pytest.mark.asyncio +async def test_adversarial_rejects_implausible_insight(): + out = await _run( + _expected_spec("adversarial"), + _stub_registry(), + demo.SCENARIOS["adversarial"]["task"], + ) + assert len(out["verified"]) == 2 + assert 
any("1,000,000" in r for r in out["rejected"]) + + +@pytest.mark.asyncio +async def test_tournament_picks_best_chart_no_llm_needed(monkeypatch): + monkeypatch.setitem(demo._BQ, "disabled", True) + # pairing + judging are deterministic mocks even in the LIVE registry. + out = await _run( + _expected_spec("tournament"), + demo._registry(), + demo.SCENARIOS["tournament"]["task"], + ) + # bracket converges to bar; the winner is rendered as a chart artifact. + assert out["chart_type"] == "bar" + assert out["vega_lite"]["mark"] == "bar" + assert "US-West" in out["ascii"] diff --git a/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md new file mode 100644 index 00000000000..45603b40319 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/DEMO_NARRATIVE.md @@ -0,0 +1,258 @@ +# Demo narrative — model-authored typed Workflows (RFC #93) + +A beat-by-beat narration for the ~7-minute recording, with a **real transcript** +captured on Vertex `gemini-3.5-flash`. Pair this with the run commands in +`README.md`. Bottom line: *ADK Web sells the product fit; pytest/CI sells the +correctness.* + +## Thesis (say this first, ~20s) + +> "#92 gives ADK a supervised concurrent executor. #93 lets a model **author** a +> typed `WorkflowSpec` — a plan as *data, not code* — that ADK validates against +> a capability allow-list, freezes, and executes reproducibly. Watch the model +> write a plan, ADK validate it, run it, and then **replay the exact same frozen +> plan** without re-invoking the model." + +## Beat 1 — author (ADK Web chat) + +Send: **"Plan and run a codebase security review."** The chat streams: + +``` +🧭 Model-authored Workflow — planning a security audit over 4 files using only + registered capabilities (reviewer, verifier, triager, formatter). 
+ +📋 Authored plan (pipeline → step → step): + { + "goal": "Audit files and format the report", + "steps": [ + {"kind": "pipeline", "id": "review_pipeline", + "over": {"source":"task","path":"files"}, + "stages": [{"capability":"reviewer"}, {"capability":"verifier"}], + "collect": "list"}, + {"kind": "step", "id": "triage_step", "input": {"source":"step","step":"review_pipeline"}, "capability": "triager"}, + {"kind": "step", "id": "format_step", "input": {"source":"step","step":"triage_step"}, "capability": "formatter"} + ], + "output": {"source": "step", "step": "format_step"} + } +``` + +> "The model emitted a *typed plan*, not code — a **pipeline** over the files +> (`reviewer → verifier` per file, barrier-free), then `triager`, then +> `formatter`, with explicit data bindings between steps. The pipeline is the +> construct that lets each file flow review→verify independently — item A can be +> verifying while item B is still being reviewed." + +## Beat 1b — free authoring (the honest "model-authored" claim) + +In a new session, send: **"Freely plan a security review of the files — decompose it yourself."** + +``` +🧭 Free authoring — the planner receives ONLY the goal + capability + descriptions (reviewer, verifier, triager, formatter); no plan recipe. + The shape below is the model's own decomposition (it may differ run to + run — and the freeze beat then makes THIS run replayable). +``` + +> "Beat 1 was instruction-guided so the recording is reproducible — the model +> filled in a known shape. This beat is the real claim: goal + capabilities +> in, plan out, no recipe. Whatever it authors, the same validator, lints, +> freeze, and replay machinery apply — free authoring composes with +> everything you're about to see, and the frozen hash pins *this* run." + +## Beat 2 — validate (capability allow-list) + +``` +✅ Validation passed. Capabilities referenced (all registered): + ['formatter', 'reviewer', 'triager', 'verifier']. 
+``` + +> "Validation confirms every capability the plan names is in the registry. The +> model can only compose pre-approved capabilities — no arbitrary calls, no code +> execution. That's the security model: capability allow-listing, not a sandbox." + +## Beat 2b — independence lints (the quality argument, made static) + +``` +🧪 Plan-quality lints: 0 warnings. Agent independence is statically checkable + from the typed bindings — the frozen record *proves* it to an auditor: + - pipeline 'review_pipeline': stage 'verifier' sees ONLY stage 'reviewer''s + per-item output — independent verification, per item + - 'review_pipeline' consumes ONLY the task input + - 'triage_step' consumes ONLY the typed output of 'review_pipeline' + - 'format_step' consumes ONLY the typed output of 'triage_step' +``` + +> "This is the multi-agent quality argument, made structural. Isolation is what +> mitigates the documented single-agent failure modes — an agent grading its own +> output (self-preferential bias) and requirements decaying through layers of +> summarization (goal drift). Because every step's only input is a typed +> binding, those properties are **statically checkable**: the validator warns +> if a plan has a step reviewing its own capability's output, or a fan-out +> that's never synthesized. Here it's clean — and the frozen record *proves* +> the verifier saw only the reviewer's output. You can't prove that about +> model-written orchestration code; you can about a typed plan. For a regulated +> audience, that's the sharpest line in the demo." + +## Beat 3 — freeze (State tab) + +``` +🔒 Frozen spec persisted to session state — hash 71997cdf0669. + Re-send the prompt: it replays this exact plan, not a new one. +``` + +> "Open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. The plan +> is now durable data you can store, diff, and audit." + +*(Presenter note: session **state** keeps a minimal `{spec, hash}` subset so the State tab stays readable. 
The **export** beat below serializes the full `FrozenWorkflowRecord`. Production v1 would persist the full record to state too — see `authored_workflow_spike/DESIGN.md` §5. The split here is presentational, not the canonical contract.)* + +## Beat 3b — export the plan (the enterprise artifact) + +``` +📦 Exported plan → security_audit_plan.json (full 71997cdf0669, schema v1, + planner gemini-3.5-flash). Re-imported OK — import recomputes the hash and + re-validates against the current registry, never trusting the envelope's own + validation. This is the reviewable / diffable / replayable audit artifact. +``` + +> "The frozen plan isn't just in-memory state — it serializes to a **portable +> JSON envelope**: the spec, its `sha256`, the planner model, registry + +> per-capability versions, the validation result, and a *digest* of the task +> input (not the raw input). `cat security_audit_plan.json` — this is the thing +> you check into a repo, diff in a PR, and hand to an auditor. And import is +> **defensive**: it recomputes the hash (rejects a tampered spec), re-validates +> against the *current* registry (rejects a dropped capability), and flags +> per-capability version drift — it never trusts the envelope's own `validation` +> stamp. That defensive import is exactly what makes a model-authored plan safe +> to store and replay later." + +Show the file on camera: + +```bash +cat security_audit_plan.json | jq '{schema_version, spec_hash, planner_model, capability_versions, validation}' +``` + +## Beat 3c — lower the static subset to ADK config + +``` +🧬 ADK config lowering (static subset) — 2/3 top-level steps project to ADK + config; dynamic blocks stay SpecInterpreter-only: ['pipeline']. 
+ { "agent_class": "SequentialAgent", "name": "security_audit_planner", + "sub_agents": [ + { "agent_class": "", "workflowspec_kind": "pipeline", … }, + { "agent_class": "LlmAgent", "name": "triage_step", "capability": "triager" }, + { "agent_class": "LlmAgent", "name": "format_step", "capability": "formatter" } ] } +``` + +> "This is the convergence with ADK config, made concrete. The static parts are +> what the `loop_config/root_agent.yaml` style is good at: a known Workflow graph +> and known child agents. This demo projects the top-level sequence onto that +> family of config shapes, with leaves referenced by **capability name, not an +> importable FQN**. The `reviewer → verifier` **pipeline** is flagged +> **no-AgentConfig-equivalent** because it is per-item over a runtime list; raw +> YAML would need a wrapper node, while `WorkflowSpec` keeps it typed and +> policy-checked. Honest framing: this is an *illustrative projection* (RFC #93 +> §11), not a loadable `root_agent.yaml`; execution still runs through the +> interpreter." + +## Beat 4 — execute (Events / trace tab) + +``` +📄 Audit result: Identified 4 vulnerabilities: 1 critical (command injection), + 2 high (hardcoded credentials and SQL injection), and 1 medium (division by zero). +📊 Cost: 10 capability dispatches in 8.2s + 1 planner call — per-step work runs + outside the planner's context. +``` + +> "Open **Events**: ADK runs the plan on the real engine via the #92 supervisor. +> Note the interleaving — `reviewer` and `verifier` events alternate **per +> file** (a file is being verified while another is still under review); that's +> the barrier-free pipeline, not two separate fan-out waves. Then `triager` over +> all verified findings, then `formatter`. The findings are real: a CRITICAL +> `os.system` injection, HIGH hardcoded creds and SQL injection, and a MEDIUM +> divide-by-zero. 
And note the cost line: **one** planner call, ten capability +> dispatches — the plan is authored once, the work scales outside the planner's +> context. On the replay beat it'll say **zero** planner calls." + +## Beat 5 — reproduce (re-send the same prompt) + +``` +♻️ Reusing frozen plan from session state — hash 71997cdf0669. + The model is NOT re-invoked; the exact prior plan is replayed. +✅ Validation passed. ... +📄 Audit result: ... +``` + +> "Send the same prompt again — **same hash, model not re-invoked**. The frozen +> plan is replayed. That's the reproducibility guarantee: authoring is a +> one-time, auditable step; execution is deterministic replay." + +**Verified outputs (this capture):** + +| Run | `reused` | `hash` | +| ----------- | -------- | -------------- | +| 1 (author) | `false` | `71997cdf0669` | +| 2 (re-send) | `true` | `71997cdf0669` | + +Same hash, `reused` flips to `true` — the model is not called the second time. + +## Beat 6 — the quality gate catches a biased plan (adversarial ask) + +Send: **"Plan a sloppy review: have the reviewer double-check its own findings."** + +``` +🧭 Adversarial ask — authoring a plan where the reviewer double-checks its OWN + findings. Watch the quality gate. +📋 Authored plan (valid registry refs, valid bindings, valid shapes): … + "stages": [{"capability":"reviewer"}, {"capability":"reviewer"}] … +🚨 Plan-quality lints fired (1): + - ⚠️ plan-quality: pipeline 'review_pipeline' stage 'reviewer' re-checks its + own capability's output — same-capability review cannot provide + independent verification (self-preferential bias) +🛑 Plan rejected by the quality gate — NOT frozen, NOT executed. +``` + +> "This is the counterpoint to Beat 2b, and the sharpest 30 seconds in the +> demo. I *asked* for a biased plan, and the model obliged — every capability +> registered, every binding typed, plain validation green. A code-authoring +> system would now run it. 
Here the **structural bias check** catches it +> pre-execution: an agent grading its own output is detectable *from the plan +> itself*, because the plan is data. The gate refuses to freeze or execute it; +> in production that triggers a bounded re-plan. Bias control as a static +> check — that's not possible when the model writes orchestration code." + +## Close (~20s) + +> "So: a model authored a typed, validated, capability-bounded plan whose +> **agent independence is statically proven**; ADK executed it on the real +> engine at a visible cost (one planner call, the work outside its context); +> the plan **exported** to a portable, defensively-imported audit artifact; and +> a re-send replayed the exact frozen plan with zero planner calls. The +> deterministic test suites — 11 (#92) + 36 (#93) + 8 (demo) — lock all of this +> in CI, including the no-LLM reuse path, the export round-trip / tamper / +> drift checks, the plan-quality lints, and the six-coordination-pattern +> coverage sweep (adversarial verification and tournament included)." + +**Convergence with ADK Workflow config / `root_agent.yaml`** — this is what Beat 3c shows, if a reviewer asks "why not author `loop_config/root_agent.yaml`?": + +> "`loop_config/root_agent.yaml` is a good **derived target** for static graph +> structure: it has `agent_class: Workflow`, fixed `edges`, child YAML files, and +> route functions like `.agent.route_headline`. It is not the right **raw model +> output** because those refs are exactly what we don't want a model to invent: +> Python functions, `_code` refs, child config paths, tools/callbacks, or FQNs. +> #93 keeps the planner output closed and allow-listed, then lowers static parts +> toward config. The `reviewer → verifier` pipeline stays a first-class +> `WorkflowSpec` block because it dispatches per item over a runtime list; raw +> YAML would need a wrapper. 
The lowering shown is illustrative, not a loadable +> `root_agent.yaml`; a full config compiler is future work. `Workflow` itself is +> not deprecated, but the current config loader path and agent-config sugar +> classes are `@deprecated` + `@experimental`, so this is convergence with the +> Workflow config *shape* for compatibility, not a bet on today's loader or +> deprecated sugar." + +## Proof commands (terminal, ~60s) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 36 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 +``` diff --git a/contributing/samples/workflows/authored_workflow_demo/README.md b/contributing/samples/workflows/authored_workflow_demo/README.md new file mode 100644 index 00000000000..a22490bb4cd --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/README.md @@ -0,0 +1,103 @@ +# ADK Web demo — model-authored typed Workflows (RFC #93) + +A ~7-minute demo: a model authors a typed `WorkflowSpec`, ADK validates it +against a capability registry, freezes it to session state, and executes it on +the real ADK engine via the #92 supervisor — all visible in ADK Web's chat, +state, and event surfaces. **ADK Web sells the product fit; pytest/CI sells the +correctness.** + +## 0. Configure a model (no hardcoded project) + +```bash +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=YOUR_PROJECT_ID +export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves from `global` +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access +``` + +## 1. Thesis (20s) + +- **#92** is the supervised concurrent executor (`DynamicNodeSupervisor` + `ctx.pipeline`). +- **#93** is the model-authored typed `WorkflowSpec` layer. +- The demo: a model authors a *validated* plan, then ADK executes that *frozen* plan reproducibly. + +## 2. 
ADK Web walkthrough (3–5 min) + +```bash +adk web contributing/samples/workflows/authored_workflow_demo \ + --port 8000 --session_service_uri "sqlite:///demo_sessions.db" +``` + +Open the UI, pick `security_audit_planner`, and send: + +```text +Plan and run a codebase security review. +``` + +Point at the ADK-native evidence as it streams: + +1. **Authored `WorkflowSpec`** — the chat shows the JSON plan (`pipeline → step → step`: a `reviewer → verifier` pipeline over the files, then `triager`, then `formatter`). +1. **Validation** — "Validation passed" + the capability list (all registered). +1. **Independence lints** — `🧪 Plan-quality lints: 0 warnings.` The typed bindings make agent isolation **statically checkable**: the verifier stage provably sees only the reviewer's per-item output (independent verification, per file), and each downstream step provably consumes only its upstream's typed output. The frozen record can *prove* these structural bias controls to an auditor — model-written orchestration code can't be checked this way. +1. **Frozen spec + hash** — open the **State** tab: `authored_workflow:frozen_spec` and `…_hash`. +1. **Exported plan** — `📦 Exported plan → security_audit_plan.json`. The full `FrozenWorkflowRecord` (spec, `sha256`, planner model, registry + capability versions, validation, task-input digest) as a portable envelope; import recomputes the hash and re-validates against the current registry. `cat security_audit_plan.json | jq .` on camera. +1. **ADK config lowering** — `🧬 ADK config lowering (static subset) — 2/3 …`. The plan's static skeleton projects toward ADK Workflow/agent config shapes (a static `Workflow`/`SequentialAgent` skeleton + `LlmAgent` leaves by capability name); the `reviewer → verifier` pipeline is flagged **no-AgentConfig-equivalent**, not fabricated. An illustrative projection (RFC #93 §11) — see the talking point below. +1. 
**Execution** — the **Events / trace** view shows `reviewer` and `verifier` interleaving **per file** (the barrier-free pipeline), then `triager`, then `formatter`. +1. **Final output + cost** — the triaged audit (1 CRITICAL + 2 HIGH + 1 MEDIUM across `auth.py`/`db.py`/`net.py`/`math.py`), then `📊 Cost: 10 capability dispatches in N.Ns + 1 planner call` — the planner is invoked at most once (zero on replay); all per-step work runs outside its context. + +(Re-send the same prompt to show resume reuses the frozen spec — same hash, not re-authored.) + +Then run the **free-authoring beat** — in a **new session**, send: + +```text +Freely plan a security review of the files — decompose it yourself. +``` + +The planner receives ONLY the goal + capability descriptions (no plan recipe — `test_free_planner_instruction_is_recipe_free` pins this). The shape may differ run to run; that's the point — and the freeze beat then makes *this* run replayable. Talking point: *the default walkthrough shows the mechanics on a scripted plan; this beat is the honest "model-authored" claim.* + +Then run the **quality-gate beat** — send: + +```text +Plan a sloppy review: have the reviewer double-check its own findings. +``` + +The planner authors a *valid* plan (registered capabilities, typed bindings) whose pipeline is `reviewer → reviewer` — and the **plan-quality lint fires on camera**: `🚨 plan-quality: pipeline 'rev' stage 'reviewer' re-checks its own capability's output — same-capability review cannot provide independent verification (self-preferential bias)`, followed by `🛑 Plan rejected by the quality gate — NOT frozen, NOT executed`. Talking point: *plain validation passes; only the structural bias check catches it — before anything runs, and provably.* + +### Relationship to ADK Workflow config / `root_agent.yaml` (talking point) + +The RFC's direction is to **converge with ADK config where it fits** (RFC #93 → "Relationship to ADK Workflow config / `root_agent.yaml`"; DESIGN §11). 
The linked `loop_config/root_agent.yaml` sample is the right mental model for the **static** portion: a human-authored `agent_class: Workflow` YAML graph with known `edges`, child YAML files, and function refs like `.agent.route_headline`. #93 should be able to lower/export static graph skeletons toward that style, while the model-facing format stays `WorkflowSpec`. + +- the **top-level sequence** (`pipeline → triager → formatter`) is the kind of static composition that can lower to a static Workflow/config skeleton; +- the **`reviewer → verifier` pipeline** (per-item, barrier-free over a runtime list) is exactly what raw YAML **doesn't express directly** today; it would need a wrapper node, while `WorkflowSpec` can keep it typed and policy-checked as a first-class runtime block; +- raw YAML can name function refs, `_code` refs, child YAML files, tools, callbacks, or importable FQNs; model-authored plans should reference only allow-listed capability names. + +The demo now **shows** this split: the 🧬 lowering beat prints the static skeleton projected onto ADK config shapes (2/3 of the demo plan), with the pipeline marked no-equivalent. + +Honest scope: it's an **illustrative structural projection** (leaves by capability name, dynamic blocks flagged) — **not** a loadable `root_agent.yaml`. Execution still runs via the `SpecInterpreter` on the real engine; a full loadable-config compiler (Workflow YAML edges + child YAML + an allow-listed capability-ref field) is future work (DESIGN §12). + +> **If asked "why not just author `loop_config/root_agent.yaml`?"** — use that YAML shape as a lowering/export target for static graphs, not as the raw model output. The sample intentionally resolves Python function refs and child YAML refs; #93 needs a closed, response-schema-safe, capability-allow-listed authoring format first. 
Also, `Workflow` itself is not deprecated, but the current config loader path and agent-config sugar classes are `@deprecated` + `@experimental`; this is convergence with the Workflow config **shape** for compatibility/illustration, not a long-term dependency on today's loader or deprecated sugar (RFC §11). + +## 3. Shape sweep — not a one-off (1–2 min) + +```bash +SPIKE_LIVE=1 pytest \ + contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py -q -s +``` + +Proof points: multi-stage `fan_out → step → step`; branch `step → branch`; loop `loop_until`. + +## 4. Correctness proof (60s) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q # 11 +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q # 36 +pytest contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py -q # 8 +``` + +- Deterministic suites: #92 **11** + #93 **36** + demo **8** = **55** (incl. a no-LLM reuse-path test, the six-pattern coverage sweep — adversarial verification + tournament via loop-carried `init` — the plan-quality lints with `allow_self_chain` policy + recorded waivers, contract-hash drift rejection (fail-closed on stripped hashes), and the recipe-free free-authoring instruction pin). +- PR #3 CI green except the documented fork-only `agent-triage` token job. + +## Recording notes + +- macOS `Cmd+Shift+5` or Loom; browser at 110–125% zoom, terminal font 16+. +- Hide project IDs / env vars. Keep it under ~7 minutes. 
diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py new file mode 100644 index 00000000000..1a38cf933e9 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/__init__.py @@ -0,0 +1,15 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from . import agent # noqa: F401 diff --git a/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py new file mode 100644 index 00000000000..62bcbc85049 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/security_audit_planner/agent.py @@ -0,0 +1,484 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +"""ADK Web demo agent for RFC #93 — model-authored typed Workflows. + +`root_agent` is a `Workflow` whose single node: + 1. asks a planner `LlmAgent(output_schema=WorkflowSpec)` to author a plan, + 2. validates it (`WorkflowSpecValidator`) against a capability registry, + 3. persists the frozen spec + hash to session state, + 4. executes it on the real ADK engine via the #92 supervisor, +surfacing each step as a chat message so the ADK Web UI shows the authored +plan, validation, capabilities, frozen hash, and final output. Run with: + + adk web contributing/samples/workflows/authored_workflow_demo + +Configure a model first (no hardcoded project): + export GOOGLE_GENAI_USE_VERTEXAI=1 GOOGLE_CLOUD_PROJECT= + export GOOGLE_CLOUD_LOCATION=global SPIKE_GEMINI_MODEL=gemini-3.5-flash +""" + +from __future__ import annotations + +import datetime +import json +import os +import sys +import time +from typing import Literal + +from google.adk import Agent +from google.adk import Context +from google.adk import Event +from google.adk import Workflow +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel + +# Reuse the committed #93 authoring stack (sibling sample dir). 
+sys.path.insert( + 0, + os.path.join( + os.path.dirname(os.path.abspath(__file__)), + "..", + "..", + "authored_workflow_spike", + ), +) +from authoring import agent_config_coverage # noqa: E402 +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import export_plan # noqa: E402 +from authoring import FrozenWorkflowRecord # noqa: E402 +from authoring import import_plan # noqa: E402 +from authoring import independence_facts # noqa: E402 +from authoring import lower_to_agent_config # noqa: E402 +from authoring import sha256_hex # noqa: E402 +from authoring import SpecInterpreter # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 + +MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") +DET = types.GenerateContentConfig(temperature=0) + +# A small, deliberately-mixed codebase to audit (3 vulnerable, 1 safe). +FILES = [ + { + "path": "auth.py", + "code": "def login(pw): return pw == 'admin123' # hardcoded", + }, + { + "path": "db.py", + "code": "q = 'SELECT * FROM users WHERE id=' + request.args['id']", + }, + {"path": "net.py", "code": "os.system('ping ' + user_supplied_host)"}, + {"path": "math.py", "code": "def mean(xs):\n return sum(xs) / len(xs)"}, +] + + +class Finding(BaseModel): + path: str + severity: Literal["CRITICAL", "HIGH", "MEDIUM", "LOW", "NONE"] + issue: str + + +class ReportFixed(BaseModel): + total: int + critical: int + high: int + medium: int + low: int + none: int + summary: str + + +class Note(BaseModel): + note: str + + +def _registry() -> CapabilityRegistry: + return CapabilityRegistry([ + Capability( + name="reviewer", + input_kind="item", + output_model=Finding, + serialize_input=True, + max_fan_out=50, + build=lambda: Agent( + name="reviewer", + model=MODEL, + output_schema=Finding, + generate_content_config=DET, + instruction=( + "Input JSON with keys path and code. 
Output a Finding" + " (echo the path)." + ), + ), + ), + Capability( + name="verifier", + input_kind="item", + output_model=Finding, + serialize_input=True, + max_fan_out=50, + build=lambda: Agent( + name="verifier", + model=MODEL, + output_schema=Finding, + generate_content_config=DET, + instruction=( + "Input: a Finding JSON (path, severity, issue). Confirm the" + " severity and keep or adjust the issue. Output the Finding" + " (echo the path)." + ), + ), + ), + Capability( + name="triager", + input_kind="list", + output_model=ReportFixed, + serialize_input=True, + build=lambda: Agent( + name="triager", + model=MODEL, + output_schema=ReportFixed, + generate_content_config=DET, + instruction=( + "Input: a JSON list of Findings. Output ReportFixed:" + " total, per-severity counts (must sum to total), and" + " a one-line summary." + ), + ), + ), + Capability( + name="formatter", + input_kind="item", + output_model=Note, + serialize_input=True, + build=lambda: Agent( + name="formatter", + model=MODEL, + output_schema=Note, + generate_content_config=DET, + instruction=( + "Input: a ReportFixed JSON. Output a Note: a one-line" + " markdown bullet summarizing the audit." + ), + ), + ), + ]) + + +_REGISTRY_DESC = ( + "reviewer (item: a file with path and code -> Finding), verifier (item: a" + " Finding -> a confirmed Finding), triager (LIST of Findings ->" + " ReportFixed), formatter (item: a ReportFixed -> Note)." +) +_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." + " Author, in order:" + " (1) a pipeline over task.files with two stages, reviewer then verifier," + " so each file is reviewed and then its finding is verified per item;" + " (2) a step running triager on the pipeline output;" + " (3) a step running formatter on the report." 
+ " Use Binding(source='task', path='files') for the pipeline's over, and" + " Binding(source='step', step=) to chain steps. A pipeline stage takes" + " its input from the previous stage automatically, so stages need no input" + " binding. Set output to the formatter step." +) + + +# The QUALITY-GATE beat: a deliberately biased ask — the reviewer +# double-checking its OWN findings. Registry/bindings/shapes are all valid, so +# plain validation passes; only the plan-quality lints catch the structural +# self-review bias, and the demo rejects the plan before freezing or running. +_SLOPPY_TRIGGERS = ("sloppy", "self-review", "own findings", "double-check") +_SLOPPY_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." + " Author, in order:" + " (1) a pipeline over task.files with two stages, reviewer then reviewer" + " AGAIN — the reviewer double-checks its own findings per item;" + " (2) a step running triager on the pipeline output;" + " (3) a step running formatter on the report." + " Use Binding(source='task', path='files') for the pipeline's over, and" + " Binding(source='step', step=) to chain steps. A pipeline stage takes" + " its input from the previous stage automatically, so stages need no input" + " binding. Set output to the formatter step." +) + + +# The FREE-AUTHORING beat: the planner receives ONLY the goal + capability +# descriptions — no plan recipe. This is the honest "model-authored" claim +# (the default _PLANNER_INSTR dictates the shape for recording reliability; +# the spike's demand gate also used free authoring). +_FREE_TRIGGERS = ("freely", "free-form", "your own plan", "decompose") +_FREE_PLANNER_INSTR = ( + "Author a WorkflowSpec using ONLY these capabilities: " + + _REGISTRY_DESC + + " The task input has a 'files' list of objects with path and code." 
+ " GOAL: audit the files for security issues and produce a one-line" + " report note. Decompose the goal into a plan YOURSELF — no recipe is" + " provided. Choose whichever control blocks fit (step / fan_out /" + " pipeline / branch / loop_until). Binding rules:" + " Binding(source='task', path='files') reads the file list;" + " Binding(source='step', step=) reads a prior step's output; a" + " pipeline stage takes the previous stage's per-item output" + " automatically. Set output to the final step." +) + + +def _msg(text: str) -> Event: + return Event( + content=types.Content(role="model", parts=[types.Part(text=text)]) + ) + + +def _hash(spec: WorkflowSpec) -> str: + # The one canonical hash definition (authored_workflow_spike/authoring.py), + # shown truncated; the full digest lives in the exported FrozenWorkflowRecord. + return sha256_hex(spec.model_dump(mode="json"))[:12] + + +# Where the "Export plan" beat writes the portable envelope (cwd of `adk web`). +_EXPORT_PATH = os.path.join(os.getcwd(), "security_audit_plan.json") + + +@node(rerun_on_resume=True) +async def author_validate_execute(ctx: Context, node_input): + reg = _registry() + + # 0. QUALITY-GATE path (checked before load-or-author so it works in any + # session): an adversarial ask makes the planner author a structurally + # biased plan; the lints catch it and the gate rejects it pre-execution. + if any(k in str(node_input or "").lower() for k in _SLOPPY_TRIGGERS): + yield _msg( + "🧭 **Adversarial ask** — authoring a plan where the reviewer" + " double-checks its OWN findings. Watch the quality gate." 
+ ) + sloppy = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_SLOPPY_PLANNER_INSTR, + ) + raw = await ctx.run_node( + sloppy, + node_input=f"Audit these files: {[f['path'] for f in FILES]}.", + run_id="plan_sloppy", + ) + spec = WorkflowSpec.model_validate(raw) + yield _msg( + "📋 **Authored plan** (valid registry refs, valid bindings, valid" + f" shapes):\n```json\n{json.dumps(spec.model_dump(), indent=1)}\n```" + ) + lints = [ + w + for w in WorkflowSpecValidator(reg).validate(spec) + if w.startswith("plan-quality") + ] + if lints: + fired = "\n".join(f" - ⚠️ {w}" for w in lints) + yield _msg( + f"🚨 **Plan-quality lints fired ({len(lints)}):**\n{fired}\n\n🛑" + " **Plan rejected by the quality gate** — NOT frozen, NOT executed." + " Plain validation passed (every capability is registered, every" + " binding is typed); only the structural bias check caught it. In" + " production this triggers a bounded re-plan (`max_replans`)." + ) + else: + yield _msg( + "ℹ️ The planner did not author the biased shape this time —" + " re-send the prompt to retry the adversarial ask." + ) + yield Event(output={"rejected": bool(lints), "lints": len(lints)}) + return + + # 1. LOAD-OR-AUTHOR. If a frozen spec exists in this session, REUSE it (do not + # re-author) — this is the resume/reproducibility claim. Otherwise the model + # authors a fresh typed WorkflowSpec (data, not code). + existing = ctx.state.get("authored_workflow:frozen_spec") + if existing: + spec = WorkflowSpec.model_validate(existing) + spec_hash = ctx.state.get("authored_workflow:frozen_spec_hash") or _hash( + spec + ) + reused = True + yield _msg( + f"♻️ **Reusing frozen plan** from session state — hash `{spec_hash}`. " + "The model is NOT re-invoked; the exact prior plan is replayed." 
+ ) + else: + reused = False + free = any(k in str(node_input or "").lower() for k in _FREE_TRIGGERS) + cap_list = ", ".join(f"`{n}`" for n in reg.names()) + if free: + yield _msg( + "🧭 **Free authoring** — the planner receives ONLY the goal +" + f" capability descriptions ({cap_list}); no plan recipe. The shape" + " below is the model's own decomposition (it may differ run to" + " run — and the freeze beat then makes THIS run replayable)." + ) + else: + yield _msg( + "🧭 **Model-authored Workflow** — planning a security audit over " + f"{len(FILES)} files using only registered capabilities " + f"({cap_list})." + ) + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=_FREE_PLANNER_INSTR if free else _PLANNER_INSTR, + ) + raw = await ctx.run_node( + planner, + node_input=f"Audit these files: {[f['path'] for f in FILES]}.", + run_id="plan", + ) + spec = WorkflowSpec.model_validate(raw) + spec_hash = _hash(spec) + steps = " → ".join(s.kind for s in spec.steps) + yield _msg( + f"📋 **Authored plan** (`{steps}`):\n```json\n" + f"{json.dumps(spec.model_dump(), indent=1)}\n```" + ) + + # 2. VALIDATE — semantic validation against the registry (always). + warnings = WorkflowSpecValidator(reg).validate(spec) # raises on hard error + caps = set() + for s in spec.steps: + if getattr(s, "capability", None): + caps.add(s.capability) + for st in getattr(s, "stages", None) or []: # pipeline stage capabilities + caps.add(st.capability) + caps = sorted(caps) + yield _msg( + "✅ **Validation passed.** Capabilities referenced (all registered): " + f"`{caps}`." + + (f"\n⚠️ warnings: {warnings}" if warnings else "") + ) + + # 2b. INDEPENDENCE — the quality argument, made static. 
Isolation is what + # mitigates self-preferential bias and goal drift in multi-agent work; with + # typed bindings it is a checkable property of the frozen plan (the validator + # lints same-capability self-review and unsynthesized fan-out), not a runtime + # hope. Model-authored orchestration *code* cannot be checked this way. + lints = [w for w in warnings if w.startswith("plan-quality")] + facts = "\n".join(f" - {f}" for f in independence_facts(spec)) + yield _msg( + f"🧪 **Plan-quality lints: {len(lints)} warnings.** Agent independence" + " is statically checkable from the typed bindings — the frozen record" + f" *proves* it to an auditor:\n{facts}" + ) + + # 3. FREEZE — persist spec + hash to session state on first author only + # (visible in the State tab; reused runs already have it). + # NOTE: session state keeps a minimal {spec, hash} subset so the State tab + # stays readable for the resume/reuse beat. The EXPORT beat below serializes + # the full FrozenWorkflowRecord (planner/registry/capability versions, + # validation, task_input_digest) — see authored_workflow_spike/DESIGN.md §5/§10. + # Production v1 would persist that full record to state too; the split here is + # presentational, not the canonical contract. + if not reused: + ctx.state["authored_workflow:frozen_spec"] = spec.model_dump() + ctx.state["authored_workflow:frozen_spec_hash"] = spec_hash + yield _msg( + f"🔒 **Frozen spec** persisted to session state — hash `{spec_hash}`. " + "Re-send the prompt: it replays this exact plan, not a new one." + ) + + # 3b. EXPORT — serialize the full FrozenWorkflowRecord to a portable JSON + # envelope (DESIGN.md §10), then prove the import contract by re-importing + # it: import_plan recomputes the hash and re-validates against the CURRENT + # registry — it never trusts the envelope's own `validation`. 
+ record = FrozenWorkflowRecord.freeze( + spec, + planner_model=MODEL, + registry=reg, + created_at=datetime.datetime.now(datetime.timezone.utc).isoformat(), + task_input={"files": FILES}, + ) + envelope = export_plan(record) + try: + with open(_EXPORT_PATH, "w") as f: + json.dump(envelope, f, indent=2) + import_plan( + envelope, reg, task_input={"files": FILES} + ) # re-hash+re-validate + yield _msg( + f"📦 **Exported plan** → `{os.path.basename(_EXPORT_PATH)}` " + f"(full `{record.spec_hash[:12]}`, schema `{record.schema_version}`, " + f"planner `{record.planner_model}`). Re-imported OK — import " + "recomputes the hash and re-validates against the current registry, " + "never trusting the envelope's own validation. This is the " + "reviewable / diffable / replayable audit artifact." + ) + except OSError as e: + yield _msg(f"📦 Export skipped (filesystem): {e}") + + # 3c. LOWER — project the plan's STATIC subset toward ADK config shapes + # (RFC #93 §11 convergence, shown concretely). Illustrative structural + # projection — NOT a loadable root_agent.yaml: leaves are referenced by + # allow-listed capability name (never an importable FQN), and dynamic blocks + # (pipeline/fan_out/branch) are flagged unsupported, never fabricated. + cov = agent_config_coverage(spec) + lowered = lower_to_agent_config(spec, name="security_audit_planner") + yield _msg( + "🧬 **ADK config lowering (static subset)** —" + f" {cov['lowerable']}/{cov['total']} top-level steps project to ADK" + " config; dynamic blocks stay SpecInterpreter-only:" + f" {cov['dynamic']}.\n```json\n{json.dumps(lowered, indent=1)}\n```" + ) + + # 4. EXECUTE — run the validated plan on the real ADK engine (#92 supervisor). + t0 = time.perf_counter() + interp = SpecInterpreter(reg, ctx) + result = await interp.execute(spec, {"files": FILES}) + elapsed = time.perf_counter() - t0 + yield _msg( + "📄 **Audit result:**" + f" {result.get('note') if isinstance(result, dict) else result}" + ) + # 4b. 
COST — cheap visibility into what the orchestration spent. The planner + # was invoked at most once (zero on frozen replay); every capability dispatch + # ran OUTSIDE the planner's context. + planner_cost = ( + "0 planner calls (frozen replay)" if reused else "1 planner call" + ) + yield _msg( + f"📊 **Cost:** {interp.dispatch_count} capability dispatches in" + f" {elapsed:.1f}s + {planner_cost} — per-step work runs outside the" + " planner's context." + ) + yield Event( + output={ + "hash": spec_hash, + "result": result, + "capabilities": caps, + "reused": reused, + "dispatches": interp.dispatch_count, + } + ) + + +root_agent = Workflow( + name="security_audit_planner", + edges=[("START", author_validate_execute)], +) diff --git a/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py new file mode 100644 index 00000000000..cc7cc02025d --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_demo/test_demo_agent.py @@ -0,0 +1,285 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""CI-safe tests for the ADK Web demo wrapper (no live model). + +Covers: import + `root_agent` shape, the demo registry, demo-spec validation, +and — crucially — the **reuse path** end-to-end with a stubbed (deterministic) +capability registry, so the frozen-spec/replay claim the demo makes on camera +is actually verified without calling Gemini. 
+""" + +from __future__ import annotations + +import json +import os +import sys + +from google.adk import Event +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.adk.workflow import Workflow +from google.genai import types +import pytest + +_HERE = os.path.dirname(os.path.abspath(__file__)) +sys.path.insert(0, os.path.join(_HERE, "security_audit_planner")) +sys.path.insert(0, os.path.join(_HERE, "..", "authored_workflow_spike")) +import agent as demo # noqa: E402 +from authoring import Binding # noqa: E402 +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry # noqa: E402 +from authoring import Pipeline # noqa: E402 +from authoring import PipelineStage # noqa: E402 +from authoring import StepRef # noqa: E402 +from authoring import WorkflowSpec # noqa: E402 +from authoring import WorkflowSpecValidator # noqa: E402 + + +def _demo_spec() -> WorkflowSpec: + return WorkflowSpec( + goal="audit", + steps=[ + Pipeline( + kind="pipeline", + id="rev", + over=Binding(source="task", path="files"), + stages=[ + PipelineStage(capability="reviewer"), + PipelineStage(capability="verifier"), + ], + ), + StepRef( + kind="step", + id="tri", + capability="triager", + input=Binding(source="step", step="rev"), + ), + StepRef( + kind="step", + id="fmt", + capability="formatter", + input=Binding(source="step", step="tri"), + ), + ], + output=Binding(source="step", step="fmt"), + ) + + +def test_root_agent_importable_and_named(): + assert isinstance(demo.root_agent, Workflow) + assert demo.root_agent.name == "security_audit_planner" + assert len(demo.root_agent.edges) == 1 + + +def test_demo_registry_is_clean(): + reg = demo._registry() + for name in ("reviewer", "verifier", "triager", "formatter"): + assert name in reg + assert reg["reviewer"].input_kind == "item" + assert reg["verifier"].input_kind == "item" # pipeline stages take 
an item + assert reg["triager"].input_kind == "list" + # ReportFixed uses enumerated fields, not an open dict[str, X] map. + assert reg.open_map_warnings() == [] + + +def test_demo_spec_validates(): + WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) # no raise + + +def test_quality_gate_rejects_self_review_plan(): + # The adversarial-ask beat: reviewer double-checking its OWN findings is a + # VALID plan (registry, bindings, shapes) — only the plan-quality lint + # catches the structural bias. This pins the shape the live beat relies on. + sloppy = WorkflowSpec( + goal="audit", + steps=[ + Pipeline( + kind="pipeline", + id="rev", + over=Binding(source="task", path="files"), + stages=[ + PipelineStage(capability="reviewer"), + PipelineStage(capability="reviewer"), # self-review + ], + ), + StepRef( + kind="step", + id="tri", + capability="triager", + input=Binding(source="step", step="rev"), + ), + StepRef( + kind="step", + id="fmt", + capability="formatter", + input=Binding(source="step", step="tri"), + ), + ], + output=Binding(source="step", step="fmt"), + ) + warnings = WorkflowSpecValidator(demo._registry()).validate(sloppy) + lints = [w for w in warnings if w.startswith("plan-quality")] + assert len(lints) == 1 + assert "re-checks its own capability's output" in lints[0] + + +def test_free_planner_instruction_is_recipe_free(): + # The honesty contract behind the free-authoring beat: the default + # instruction dictates the plan; the free instruction must NOT — only the + # goal, the capability descriptions, and the binding rules. + assert "reviewer then verifier" in demo._PLANNER_INSTR # scripted (default) + assert "reviewer then verifier" not in demo._FREE_PLANNER_INSTR + assert "(1)" not in demo._FREE_PLANNER_INSTR # no step recipe + assert "YOURSELF" in demo._FREE_PLANNER_INSTR + # trigger sets must not overlap (a prompt can't be both free and sloppy). 
+ assert not set(demo._FREE_TRIGGERS) & set(demo._SLOPPY_TRIGGERS) + + +def test_demo_spec_quality_lints_clean_and_independent(): + # Zero plan-quality lints: verification is by a DIFFERENT capability + # (reviewer -> verifier), and the fan-out is synthesized (triager). + warnings = WorkflowSpecValidator(demo._registry()).validate(_demo_spec()) + assert [w for w in warnings if w.startswith("plan-quality")] == [] + # The independence facts the demo shows on camera are derivable statically: + # the verifier stage provably sees only the reviewer's per-item output, and + # each downstream step provably consumes only its upstream's typed output. + from authoring import independence_facts # noqa: E402 + + facts = "\n".join(independence_facts(_demo_spec())) + assert "stage 'verifier' sees ONLY stage 'reviewer'" in facts + assert "'tri' consumes ONLY the typed output of 'rev'" in facts + assert "'fmt' consumes ONLY the typed output of 'tri'" in facts + + +def test_demo_spec_agentconfig_lowering(): + # The demo's plan (pipeline -> step -> step) is exactly the static/dynamic + # split RFC #93 §11 describes: the two trailing steps lower to LlmAgent under + # a SequentialAgent; the reviewer->verifier pipeline has no AgentConfig + # equivalent. (Illustrative projection — leaves by capability name, not FQN.) 
+ from authoring import agent_config_coverage # noqa: E402 + from authoring import lower_to_agent_config # noqa: E402 + + cfg = lower_to_agent_config(_demo_spec(), name="security_audit_planner") + assert cfg["agent_class"] == "SequentialAgent" + kinds = [s["agent_class"] for s in cfg["sub_agents"]] + assert kinds == ["", "LlmAgent", "LlmAgent"] + assert agent_config_coverage(_demo_spec()) == { + "total": 3, + "lowerable": 2, + "dynamic": ["pipeline"], + } + assert '"code"' not in json.dumps(cfg) # never an importable FQN + + +def _stub_registry() -> CapabilityRegistry: + def stub(name, fn): + def build(): + @node(name=name) + async def n(ctx, node_input): + yield Event(output=fn(node_input)) + + return n + + return build + + return CapabilityRegistry([ + Capability( + name="reviewer", + input_kind="item", + serialize_input=False, + build=stub( + "reviewer", + lambda f: {"path": f["path"], "severity": "HIGH", "issue": "x"}, + ), + ), + Capability( + name="verifier", + input_kind="item", + serialize_input=False, + build=stub( + "verifier", + lambda finding: {**finding, "issue": finding["issue"] + "!"}, + ), + ), + Capability( + name="triager", + input_kind="list", + serialize_input=False, + build=stub("triager", lambda findings: {"total": len(findings)}), + ), + Capability( + name="formatter", + input_kind="item", + serialize_input=False, + build=stub( + "formatter", lambda r: {"note": f"audited {r['total']} files"} + ), + ), + ]) + + +@pytest.mark.asyncio +async def test_reuse_path_no_llm(monkeypatch): + """Pre-seed a frozen spec + stub the registry: the demo must REUSE the plan + (no planner/Gemini call) and still surface hash, capabilities, and output.""" + monkeypatch.setattr(demo, "_registry", _stub_registry) + spec = _demo_spec() + + ss = InMemorySessionService() + session = await ss.create_session( + app_name="demo", + user_id="u", + state={ + "authored_workflow:frozen_spec": spec.model_dump(), + "authored_workflow:frozen_spec_hash": "deadbeef0000", + }, + ) + 
runner = Runner(app_name="demo", node=demo.root_agent, session_service=ss) + + out, reused_msg, authored_msg = None, False, False + async for ev in runner.run_async( + user_id="u", + session_id=session.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + if isinstance(ev, Event) and ev.content and ev.content.parts: + for p in ev.content.parts: + if p.text and "Reusing frozen plan" in p.text: + reused_msg = True + if p.text and "Authored plan" in p.text: + authored_msg = True + if ( + isinstance(ev, Event) + and isinstance(ev.output, dict) + and "hash" in ev.output + ): + out = ev.output + + assert reused_msg and not authored_msg # reused; planner NOT invoked + assert out is not None + assert out["reused"] is True + assert ( + out["hash"] == "deadbeef0000" + ) # same frozen hash, not re-derived from a new plan + assert set(out["capabilities"]) == { + "reviewer", + "verifier", + "triager", + "formatter", + } + assert out["result"]["note"].startswith("audited") + # cost visibility: 4 files x 2 pipeline stages + triager + formatter = 10. + assert out["dispatches"] == 10 diff --git a/contributing/samples/workflows/authored_workflow_spike/DESIGN.md b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md new file mode 100644 index 00000000000..4bb8bc55e59 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/DESIGN.md @@ -0,0 +1,316 @@ +# Technical Design — Agent-authored typed Workflows (RFC #93) + +Canonical technical design for RFC #93 (GoogleCloudPlatform/BigQuery-Agent-Analytics-SDK#93). Mirrors the issue's Technical Design comment. Covers the data model, validator, interpreter/compilation, frozen-spec contract, security model, framework changes, testing, and the empirical findings that shaped it. Audience: implementers / technical reviewers. + +> **Phasing (MVP-first).** Ship **#92 first**; build full #93 only once leadership commits it as a product bet **and** 3–5 real tasks beat hand-wired workflows. 
**MVP scope** = `WorkflowSpec` + validator + **freeze/replay + export**; **defer** templates (v2), complex loops, and broad compiler features. (Strategic rationale: the concise RFC's *Positioning & priority*.) + +## 1. Data model — `WorkflowSpec` + +A plain `kind`-tagged, recursive, ordered **tree of blocks** (not a graph with jumps). `id`s are globally-unique **binding names** for dataflow, never jump targets — which removes join / fall-through / GOTO ambiguity by construction. + +```python +# src/google/adk/workflow/authoring/_spec.py (NEW) + +# Typed dataflow: a Binding is the ONLY way a node sources input — a source + +# optional dotted path, validated against the producer's output schema. +class Binding(BaseModel): + source: Literal["task", "step"] # the workflow task input, or a prior step's output + step: str | None = None # REQUIRED iff source == "step"; None when source == "task" + path: str | None = None # optional dotted field path; checked vs the schema + # model_validator enforces: (source == "step") == (step is not None) + +class StepRef(BaseModel): + kind: Literal["step"] + id: str + capability: str # MUST resolve in the registry + input: Binding # validated against the capability's input_schema + +class FanOut(BaseModel): + kind: Literal["fan_out"] + id: str + over: Binding # MUST resolve to a LIST-typed value + capability: str # run once per element (compiles to ctx.pipeline/parallel) + collect: Literal["list"] = "list" # per-item outputs aggregate to an order-preserving list + +class PipelineStage(BaseModel): + capability: str # MUST resolve in the registry; takes an item + input: Binding | None = None # defaults to the previous stage's per-item output + +class Pipeline(BaseModel): + kind: Literal["pipeline"] + id: str + over: Binding # MUST resolve to a LIST-typed value + stages: list[PipelineStage] # each item flows through ALL stages, BARRIER-FREE + collect: Literal["list"] = "list" # outputs aggregate to an order-preserving list + # Compiles 
to #92 ctx.pipeline: item A may be in stage k while item B is in stage 1. + # Failed item -> None; control exceptions follow #92. stage[0] input defaults to the + # per-item element; stage[n] input defaults to stage[n-1]'s per-item output. + +class Route(BaseModel): + value: str # the switch value this route matches + block: list["SpecNode"] # non-empty; output = block's last-node output + +class Branch(BaseModel): + kind: Literal["branch"] + id: str + on: Binding # switch value; MUST resolve to a STRING/STR-ENUM schema + routes: list[Route] # ENUMERATED LIST, not an open dict[str, ...] map (see Findings) + unmatched: Literal["fail"] = "fail" # unmatched value at runtime = FAIL (no runtime re-plan in v1) + +class LoopUntil(BaseModel): + kind: Literal["loop_until"] + id: str + body: list["SpecNode"] # non-empty; loop output = LAST iteration's last-node output + until_capability: str # MUST declare a STRICT-bool output schema + until_input: Binding # predicate input (validated vs until_capability.input_schema) + max_iters: int = Field(ge=1) # REQUIRED, >= 1 + init: Binding | None = None # LOOP-CARRIED seed: a body step may bind the loop's OWN id to + # read the prior iteration's body output (`init` on round 0). + # Surfaced by the tournament pattern (pairs recomputed per round + # from prior winners); required for any accumulate-and-refine loop. + # Binding the loop's id in the body WITHOUT init = validation error. + +# PLAIN union, each member carrying a `kind` Literal (structurally-tagged) — NOT +# Annotated[..., Field(discriminator="kind")]: the discriminated form emits a +# JSON-schema `discriminator` keyword that Gemini's response_schema rejects +# (Schema: extra_forbidden — verified). `kind` still disambiguates parsing. 
+SpecNode = Union[StepRef, FanOut, Pipeline, Branch, LoopUntil] + +class WorkflowSpec(BaseModel): + goal: str + steps: list[SpecNode] # ordered blocks (sequence by list order) + output: Binding # terminal output selection (validated) +``` + +**Block-output rule:** a block's output is its **last node's** output — so a `Branch`'s output is the taken route's last-node output, and a `LoopUntil`'s is the last iteration's last-node output. This gives every composite node a well-defined output schema, which is what makes `Binding(source="step", step=...)` schema-checkable. + +**Binding scope:** a `Binding` may reference only a step that lexically precedes it on the **same** root-to-node path (ancestors + earlier same-level siblings). References into a not-taken sibling route, or to a later step, are rejected at validation. + +## 2. The agent + +```python +# src/google/adk/workflow/authoring/_agent.py (NEW) +class AuthoredWorkflowAgent(BaseAgent): + planner_model: str + registry: CapabilityRegistry # the ONLY capabilities a plan may reference + max_replans: int = 1 + + async def _run_async_impl(self, ctx): + frozen = await self._load_frozen_spec(ctx) # resume: reuse the SAME spec, never re-plan + if frozen is None: + spec = await self._author(ctx) # LlmAgent(output_schema=WorkflowSpec) + WorkflowSpecValidator(self.registry).validate(spec) + frozen = self._freeze_and_persist(ctx, spec) # see Frozen-spec contract + workflow = WorkflowCompiler(self.registry).compile(frozen.spec) # -> a real Workflow + async for event in workflow.run_async(ctx): # deterministic + resumable + yield event +``` + +- **Authoring** = `LlmAgent(output_schema=WorkflowSpec)`; ADK validates structured output, so a malformed plan is caught and re-planned (bounded by `max_replans`). +- **Validation** is a **new semantic validator** (below) that *lowers to* `Graph.validate_graph()` for structural checks.
+- **Compilation** lowers the block tree: sequence → edges; `Branch` → conditional route edges over nested blocks; `FanOut` → `ctx.parallel`-map; `Pipeline` → barrier-free `ctx.pipeline` (multi-stage); `LoopUntil` → bounded loop. The compiled artifact is an ordinary `Workflow` — nothing downstream knows it was machine-authored. +- **Registry** = developer-supplied capabilities (an agent, or a tool wrapped as a node), each with per-capability policy. + +## 3. Validator — semantic, then structural + +`WorkflowSpecValidator` checks what `Graph` cannot, then lowers: + +- capability refs resolve in the registry; +- `Binding` invariant + path/type compatibility vs the producer's `output_schema` and consumer's `input_schema`; +- `FanOut.over` resolves to a list; the fan-out capability takes an item; +- `Branch.on` is string/str-enum-typed; route blocks share a compatible last-node output schema; non-exhaustive enum domain is flagged (unmatched at runtime fails); +- `Pipeline`: `over` resolves to a list; every stage `capability` is registered and takes an item; stage[0] input defaults to the per-item element, stage[n] to stage[n-1]'s output; the last stage's output type defines the pipeline output (validated for downstream bindings); +- `LoopUntil`: strict-bool `until_capability`, present/compatible `until_input`, `max_iters >= 1`; a body binding to the loop's own id requires `init`; +- globally-unique `id`s; binding-scope (no non-preceding / cross-route references); +- registry-version match vs a frozen spec (drift = hard error). + +Then **`Graph.validate_graph()`** (reused) handles duplicate names, `START`/reachability, duplicate edges, unconditional cycles on the compiled graph. 
+ +**Plan-quality lints (soft warnings).** Multi-agent quality rests on isolation — it mitigates the documented single-agent failure modes (*agentic laziness*, *self-preferential bias*, *goal drift*; see [Dynamic Workflows: scaling complex work](https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling)). Because dataflow is typed `Binding`s, independence is **statically checkable** — something model-authored orchestration *code* cannot offer — and the validator lints two violations: + +- **self-review**: a node (or pipeline stage) consuming output produced by the *same capability* — same-capability review cannot provide independent verification; +- **unsynthesized fan-out**: the terminal output binds a bare per-item `fan_out` never combined or verified downstream. + +**Suppression** (so the lints stay credible instead of globally disabled): a capability registered with `allow_self_chain=True` opts out of the self-review lint (legitimate `draft → critique → redraft` refinement), and per-plan `lint_waivers` (node id → justification) are **recorded in the `FrozenWorkflowRecord`** — a suppressed lint is an auditable decision, not a silenced one. + +The complementary positive facts (`independence_facts`) are derivable from the frozen spec — e.g. *"stage `verifier` sees ONLY stage `reviewer`'s per-item output"* — which is what lets the frozen record **prove** structural bias controls to an auditor, not just assert them. + +## 4. Semantics + +- **Authoring non-deterministic; execution deterministic.** Once frozen, execution + resume replay is fully deterministic (it's just a `Workflow`). +- **Reuses #92 + the engine wholesale.** Fan-out → supervised `ctx.pipeline`/`ctx.parallel` (bounded, interrupt-safe); sequence/branch → edges + routes; loop → bounded loop. No new executor. 
+- **`Pipeline` is barrier-free per-item** (compiles directly to #92's `ctx.pipeline`): item A may be in stage *k* while item B is in stage 1; an ordinary failure drops that item to `None`; control exceptions follow #92. This closes the gap where the vocabulary was *less* expressive than its own executor — a single-capability `fan_out` is parallel-map; `Pipeline` is the multi-stage barrier-free form. +- **Re-plan is pre-execution-only.** `max_replans` applies only to validation failures; an execution failure fails the frozen run; recovery = a new explicit run/version. No recursive planner-spawning-planner. +- **Budget + agent caps from #92** bound a mis-plan's spend. + +## 5. Frozen-spec contract (correctness requirement) + +Persist **one** `FrozenWorkflowRecord` before any execution — the *same* shape backs session state, the audit event, and the export envelope (§10), so v1 storage is never a weaker subset: + +```python +class FrozenWorkflowRecord(BaseModel): + schema_version: str # "v1" + spec: WorkflowSpec + spec_hash: str # sha256(canonical_json(spec)) — see §10 + planner_model: str + registry_version: str + capability_versions: dict[str, str] # manual bumps — coarse SECONDARY signal + capability_contract_hashes: dict[str, str] # DERIVED sha256(input_kind+output schema) — primary drift signal + lint_waivers: dict[str, str] # node id -> justification; auditable lint suppression + validation: ValidationResult # {passed: bool, warnings: [...]} + created_at: str # ISO-8601, stamped at freeze + task_input_schema: dict | None # expected root task-input schema (enables template reuse) + task_input_digest: str | None # sha256(canonical_json(task_input)) +``` + +Deterministic replay holds **only** if resume loads the **same** record → **resume MUST reuse it and MUST NOT re-plan** unless the user starts a new run; a registry/capability-version mismatch on resume is a hard error. 
+ +- **Storage target (v1):** the **full record** in session state under an **unprefixed (session-scoped) key** `authored_workflow:frozen_record` — not just `{spec, hash}`, so drift detection and audit have everything they need. **Not** `app:` (app-scoped — `State.APP_PREFIX`, extracted in `_session_util.extract_state_delta` — would leak per-run data and break per-run resume). +- **Audit event shape:** persist **state-only** — `Event(state={"authored_workflow:frozen_record": record})`. **Not** `Event.output` (`NodeRunner._track_event_in_context` sets `ctx.output = event.output`; `Context.output` rejects a second output → "Output already set"). **Not** `Event.content` (would re-enter a model's context). +- **Demo vs production:** the committed demo persists only a minimal `{spec, hash}` subset to keep the walkthrough readable — **it illustrates the behavior; production v1 would store the full `FrozenWorkflowRecord`.** The demo is illustrative, not the canonical contract. + +## 6. Security model + +Going declarative **eliminates the code-execution / sandbox-escape class** — but **not** all risk (bad args, prompt-injected inputs, side-effectful tools, expensive fan-out/loops). Controls = validation **+ per-capability policy**: + +- **Capability allow-list** — non-registry refs rejected at validation. +- **No code execution** — nothing to sandbox. +- **Per-capability policy** (registry-declared): `max_calls`, `max_fan_out`, allowed caller/edge constraints, `side_effect` (requires explicit approval to appear in a plan), argument constraints/schema. **Static vs runtime split:** the validator enforces statically-knowable policy (static call counts, `max_iters`, side-effect approval, caller/edge, arg schemas); runtime enforces data-dependent caps before dispatch (`max_fan_out` vs actual list size, realized branch-path call counts). 
+- **Output-schema guidance (from the spike):** registered capabilities should avoid open `dict[str, X]` output maps (Gemini fills them unreliably); the registry/validator SHOULD warn, and outputs should carry invariants (e.g. counts sum to total) checked with one repair retry. +- **Per-capability permissions unchanged** — each agent runs under its own ADK tool allowlist; authoring grants no elevation. +- **Bounded blast radius** — current ADK enforces `RunConfig.max_llm_calls` (default 500); the proposed #92 limits (leaf gate, optional per-run agent cap, optional `max_tokens`) bound further; `max_iters`/`max_replans` bound loops. +- **Auditable** — frozen spec (+ hash, versions) persisted; humans can review/pre-approve. + +Residual: "model composes approved capabilities, within policy, in a wasteful-but-bounded order, possibly on injected inputs" — dramatically smaller than executing model-authored Python, but **not zero**; argument-level injection into an approved side-effectful tool is the sharpest residual (hence side-effect caps default to approval-required). + +## 7. Backward compatibility + +Fully additive. New `authoring/` package + `AuthoredWorkflowAgent`; no change to existing agents, `Workflow`, or the engine. Opt-in; the compiled artifact is a plain `Workflow`. + +## 8. Testing + +- **Semantic validator rejects:** unknown capability; `Binding` invariant / incompatible path-type; `FanOut.over` non-list; `Branch.on` non-string or incompatible route output schemas; `LoopUntil` non-strict-bool predicate / missing `until_input` / `max_iters < 1`; non-preceding or cross-route binding; duplicate `id`; registry drift. +- **Structural lowering:** `Graph.validate_graph()` catches duplicate names / unreachable / unconditional cycles. +- **Frozen-spec contract:** persisted before execution; resume reuses, does not re-plan; registry-version mismatch is a hard error. 
+- **Per-capability policy:** plan exceeding `max_calls`/`max_fan_out` or placing an unapproved side-effect capability is rejected pre-execution. +- **Compiler:** golden test — a `WorkflowSpec` lowers to a `Workflow` matching a hand-written equivalent; fan-out → bounded `ctx.pipeline`. +- **`AuthoredWorkflowAgent`:** malformed planner output → bounded re-plan → fail past `max_replans`. +- **Determinism:** frozen spec replays identically, resumes exactly-once (inherits #92). +- **Two gates:** *planning* (valid + sensible + executable + structurally matches a hand-wired baseline) and *output-quality* (intermediate outputs match, capability invariants hold, one repair retry). +- **Pattern coverage:** the six empirically common coordination patterns (classify-route, fan-out/synthesize, generate-filter, loop-until-done, adversarial verification, tournament) all author + validate + execute. The two non-obvious shapes have explicit deterministic tests; tournament exercises loop-carried state. +- **Plan-quality lints:** same-capability self-review and unsynthesized fan-out warn; an independent (different-capability) verification plan lints clean. + +## 9. Empirical findings (from the demand-gate spike on `gemini-3.5-flash`) + +1. **Gate passed.** A planner authored a valid, structurally-correct spec for a codebase audit, validated first try, executed on the real engine, matched a hand-wired baseline — across multi-stage / branch / loop_until shapes. +1. **Open-`dict[str, X]` maps are a structured-output reliability hazard** — hit twice: a capability's `counts: dict[str,int]` came back empty, and the spec's own `Branch.routes` (an open map) came back empty. **Both fixed by enumerated/list structures** (`Branch.routes` → `list[Route]`; capability outputs use fixed fields). The validator warns on open-map capability outputs. +1. 
**Discriminated unions are incompatible with Gemini `response_schema`** — `Field(discriminator="kind")` emits a `discriminator` keyword genai rejects (`Schema: extra_forbidden`). Use a plain `kind`-tagged union. +1. **Planner quality vs capability quality are separable** — authoring/structure was reliably good; the residual variance was per-capability output quality (prompts/schemas/retries), proven via an intermediate-output diff (authored vs baseline findings were semantically identical). The strict `unmatched=fail` branch contract also caught a bad field-binding loudly instead of mis-routing. +1. **The pattern-coverage sweep surfaced a real vocabulary gap** — the tournament shape (pairs recomputed per round from the prior round's winners) is inexpressible without **loop-carried state**: a body step must read the previous iteration's output, which the binding-scope rules statically forbid. Fixed with `LoopUntil.init` (seed binding) + the rule that a body binding to the loop's own id reads the carried value. Pattern-driven gate-task selection finds these gaps; single ad-hoc tasks don't. + +Re-runnable: `contributing/samples/workflows/authored_workflow_spike/` (36 deterministic tests + env-gated live sweep) and `authored_workflow_demo/` (ADK Web `root_agent` + 8 CI-safe tests incl. the no-LLM reuse path), in `caohy1988/adk-python` PR #3. + +## 10. Plan export & storage — the frozen spec as a durable artifact + +> **Spike status:** `export_plan` / `import_plan` / `FrozenWorkflowRecord` are **implemented** in `authoring.py` and exercised by deterministic tests (round-trip, tamper, dropped-capability, version-drift, replay-vs-template input) and a live demo "Export plan" beat. The *tiering* below remains the production roadmap. + +**Source of truth = the typed `WorkflowSpec`.** The compiled `Workflow` is a *derived* artifact. 
Storage is tiered, scoped to keep generated code and compiled graphs out of v1: + +- **v1 (required) — persist the full `FrozenWorkflowRecord` per run** (§5) under `authored_workflow:frozen_record` — for resume/replay **and** drift detection. + +- **v1.1 (recommended) — export the record as a portable JSON envelope.** The envelope **is a serialized `FrozenWorkflowRecord`** (§5) — same fields, never a weaker shape — produced by an explicit "Export plan" operation: + + ```json + { + "schema_version": "v1", + "spec": { "...": "the WorkflowSpec" }, + "spec_hash": "...", + "planner_model": "...", + "registry_version": "...", + "capability_versions": { "reviewer": "...", "triager": "..." }, + "capability_contract_hashes": { "reviewer": "...", "triager": "..." }, + "lint_waivers": {}, + "validation": { "passed": true, "warnings": [] }, + "created_at": "<ISO-8601 timestamp>", + "task_input_schema": { "...": "expected task-input JSON schema, or null" }, + "task_input_digest": "<sha256 hex digest, or null>" + } + ``` + + This is the enterprise story: a model-authored plan becomes **reviewable, diffable, auditable, replayable** data. `created_at` is stamped at export (not at replay); `task_input_digest` is a digest so a portable plan doesn't carry raw task content. + + **Digest/hash definition.** `spec_hash` and `task_input_digest` are `sha256` over **canonical JSON** — `json.dumps(value, sort_keys=True, separators=(",", ":"))` — of the spec and the task input respectively. A single fixed definition so two exporters produce identical hashes for the same logical value (no whitespace/key-order drift). + + **Execution-input contract on import.** `task_input_digest` is *advisory provenance* for replaying the **original** run. Reusing a plan against a **new** task input is template behavior: ADK validates the new input against the captured `task_input_schema`. If `task_input_schema` is null (none captured), import may only **replay** with a matching `task_input_digest`, or must go through explicit **template promotion** (which attaches a `task_input_schema`) first. A stored plan must never silently bind (e.g. 
`task.files`) against an incompatible task shape. + + ```python + def export_plan(record: FrozenWorkflowRecord) -> dict: ... # serialize the §5 record + def import_plan(envelope, registry, *, task_input=None) -> WorkflowSpec: + # INTEGRITY (never trust the envelope's own `validation`): + # 1. recompute sha256(canonical_json(spec)); REJECT if != envelope["spec_hash"] + # 2. re-run WorkflowSpecValidator against the CURRENT registry + # 3. registry/capability drift -> fail loudly (or explicit migration); + # capability drift = manual version (secondary) AND derived contract + # hash sha256(input_kind + output schema) (primary — catches schema + # changes nobody versioned). FAIL CLOSED: a v1 envelope must carry a + # contract hash for EVERY referenced capability — a stripped field or + # entry is a hard import error, never a silent bypass. + # EXECUTION-INPUT: + # replay : task_input digest must match envelope["task_input_digest"] (else audit-only) + # template : task_input validated against envelope["task_input_schema"] before execution + # neither : do NOT execute against arbitrary new input + ``` + +- **v2 (optional) — promote an exported plan to a reusable template.** A human approves a spec and saves it as a template. **On import, ADK MUST re-validate against the *current* registry**; registry/capability drift **fails loudly or requires explicit migration** — never a silent run against a changed capability set. (The envelope's `registry_version` / `capability_versions` are what make drift detectable.) + +- **Deferred — envelope-level integrity beyond `spec_hash`.** `spec_hash` protects the *plan*; envelope metadata (`task_input_schema`, `created_at`, …) is re-checked against the current registry where possible but not integrity-protected — a tampered `task_input_schema` could turn a replay-only plan into a template. Production v1.1 should sign or hash the full serialized record. 
+ +- **Deferred — compiled `Workflow`/graph (or generated Python) as the source of truth.** The compiled `Workflow` is regenerated from the spec on demand; it is **not** stored as canonical, because compiler behavior and ADK internals evolve. Persisting generated code or a compiled graph is explicitly out of scope. + +Net: this turns the proposal from "a model can author plans" into "**model-authored plans become durable enterprise artifacts**" — without committing to durable generated code. + +## 11. Convergence with ADK Workflow config / `root_agent.yaml` (+ storage, custom tools, observability) + +A reviewer asked whether the planner should author ADK's existing **YAML config** directly, specifically the `contributing/samples/workflows/loop_config/root_agent.yaml` pattern. Verified against source and the sample — `loop_config/root_agent.yaml` is `agent_class: Workflow` with static `edges`, function refs like `.agent.route_headline`, and child YAML refs like `generate_headline.yaml`; the lower-level loader still goes through the `AgentConfig` / `BaseAgentConfig` path and resolves code/config refs via `config_agent_utils.py`. + +**Lower to config where it fits.** ADK Workflow YAML already models a useful *static* graph shape (`agent_class: Workflow`, `edges`, route labels, child agent YAML files). The static subset **should lower to that style** rather than inventing a separate serialization. The spike **demonstrates the first step** with an illustrative structural projection (`lower_to_agent_config` — `SequentialAgent`/`LoopAgent`/`LlmAgent` shapes, leaves by capability name, dynamic blocks flagged as having no config equivalent); a **full loadable-`root_agent.yaml` compiler** (Workflow YAML edges + child YAML + an allow-listed capability-ref field) remains future work (§12). 
+ +| `WorkflowSpec` block | ADK config relationship | +| -------------------------------------------- | ------------------------------------------------------------------------------------------------------------ | +| sequence / static branch / static route loop | should lower to `agent_class: Workflow` YAML (`edges`), like `contributing/samples/workflows/loop_config` | +| leaf capability | should lower to child agent YAML or an allow-listed capability-ref field, not an importable FQN from a model | +| bounded `LoopUntil` | can lower the bounded graph skeleton; its `until_capability` predicate remains interpreter/compiler logic | +| runtime `fan_out` / `pipeline` | no direct YAML equivalent for per-item runtime list dispatch / barrier-free multi-stage flow | + +`loop_config` is the right mental model for the **static** portion: a known graph with known function/agent references. It is not enough for #93's planner-facing contract because the model would be authoring those references. The safe contract is still `WorkflowSpec` → validate against the registry → optionally lower/export to Workflow YAML as a **derived artifact**. + +**Caveat (ADK source):** `Workflow` itself is not marked deprecated in this checkout; the recommended static target is the `agent_class: Workflow` graph-YAML shape. What *is* marked **`@deprecated` + `@experimental`** is the current `AgentConfig` / `BaseAgentConfig` loader path and the concrete `Sequential`/`Parallel`/`LoopAgentConfig` sugar classes (`agents/agent_config.py:72-73`, `base_agent_config.py:30`, `sequential_agent_config.py:28`, `loop_agent_config.py:30`). So this is **convergence with the Workflow config *shape* for compatibility/illustration — not a long-term dependency** on today's YAML loader or deprecated agent-config sugar. If the config surface stabilizes under a different shape, the lowering target moves with it; the `WorkflowSpec` authoring layer is unaffected. 
+ +**Why the planner should not emit raw `root_agent.yaml`:** + +1. **It is static / load-time.** `loop_config` wires known nodes and routes ahead of time. That is great for human-authored graphs, but runtime per-item `fan_out` and barrier-free `pipeline` need dispatch over the actual input list; YAML can only call a wrapper node for that today, not express the dynamic dispatch itself. +1. **It is not a clean `response_schema`.** The loader model uses `AgentConfig` as a `RootModel` over a `Discriminator(agent_config_discriminator)` union; Gemini's `response_schema` rejects the emitted `discriminator` keyword (`Schema: extra_forbidden` — the spike's §9 lesson). It also carries open `extra='allow'` maps (`ToolArgsConfig`, `BaseAgentConfig.model_extra`). +1. **Trust-boundary mismatch on refs.** `loop_config` intentionally resolves `.agent.process_input`, `.agent.route_headline`, `output_schema_code: .agent.Feedback`, and child YAML files. Tools/agents/callbacks can also be named by **fully-qualified importable path** (`CodeConfig.name`, `AgentRefConfig.code`, `LlmAgentConfig.tools[].name`, `*_callbacks`) resolved via `importlib`. That is appropriate for **developer-authored** config; the concern is specifically letting a **model** author those raw refs. For model-authored plans we want **capability allow-listing**, not arbitrary code/config/import paths — a trust-boundary difference, not a flaw in config. + +**Direction:** keep `WorkflowSpec` as the thin **authoring** schema (closed, allow-listed, `response_schema`-safe); lower/export its static graph subset to ADK Workflow YAML so those shapes share ADK's serialization and tooling; keep runtime `fan_out` / `pipeline` + capability allow-listing as new surface only for the dynamic and trust-boundary pieces config doesn't cover. The compiled artifact is still an ordinary `Workflow` (§2). 
+ +**Q1 — spec storage.** §5/§10: one `FrozenWorkflowRecord` in session State (`authored_workflow:frozen_record`, unprefixed/session-scoped; resume reuses, never re-plans), a state-only audit event, and a v1.1 export envelope. Compiled `Workflow` is derived, never canonical. + +**Q2 — custom tools.** A custom tool is a **registered capability** referenced by **registry name** (the registry is the allow-list), carrying per-capability policy (`max_calls`, `max_fan_out`, `side_effect`→approval, arg constraints) — §6. Deliberately *not* config's FQN `tools:` field: the model never names an import path. + +**Q3 — version control & observability.** Drift surface = `spec_hash` (sha256/canonical-JSON) + `planner_model` + `registry_version` + per-capability `capability_versions` in the record (§5); import hard-errors on schema-version, hash, registry-version, or capability-version drift (spike-enforced, §10). The export envelope is diffable for PR/audit review. Runtime observability is unchanged: the compiled `Workflow` runs on the real engine, so existing ADK tracing/events apply; the frozen record + hash anchor each run to its plan. + +## 12. Future (post-gate, NOT MVP) + +**Hierarchical / sub-plan authoring** — a registered capability that is itself an `AuthoredWorkflowAgent`, so a step can expand into its own authored sub-plan. This is the likely path to parity with Claude Code's unbounded orchestration (it lifts the single-response plan-size ceiling), but it is **out of MVP scope** and should be evaluated **only after the 3–5-task build gate**. MVP stays single-level: `WorkflowSpec` + validator + freeze/replay + export. + +**Upstream config extension (optional).** If the dynamic constructs prove their value, the cleaner long-term home for runtime `fan_out` / `pipeline` may be **new Workflow YAML block types upstream** plus an allow-listed capability-reference field — at which point authoring could converge more fully onto an extended ADK config shape. 
Out of scope here; depends on upstream accepting those config/compiler extensions. + +**Budget as a bindable runtime value (v1.1-sized).** #92 caps *bound* spend, but a plan cannot *react* to it. Allowing `until_input` (or any `Binding`) to source a runtime-provided budget struct — e.g. `Binding(source="runtime", path="budget.remaining_tokens")` — makes loop-until-budget expressible declaratively, with no new node kind. + +**A "no-plan" escape hatch.** Each orchestration level adds overhead; small, linear tasks are solved more efficiently by a single agent. Letting the planner's output schema include a degenerate direct-execution variant (a single `StepRef`, or an explicit `kind: "direct"`) lets trivial inputs skip orchestration — classify-and-route applied to the meta-decision of whether to orchestrate at all. + +## References + +- #92 — supervised concurrent dynamic dispatch + `ctx.pipeline` (executor). +- Claude Code Dynamic Workflows — https://code.claude.com/docs/en/workflows +- Empirical patterns & failure modes: *Claude Dynamic Workflows: Scaling Complex Work* — https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling +- ADK: `Workflow`/`Graph` (`src/google/adk/workflow/_graph.py`), `LlmAgent.output_schema` / `validate_schema`, `BaseAgent.run_async`, `_session_util.extract_state_delta`, `NodeRunner._track_event_in_context`. diff --git a/contributing/samples/workflows/authored_workflow_spike/README.md b/contributing/samples/workflows/authored_workflow_spike/README.md new file mode 100644 index 00000000000..e95b4f52c41 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/README.md @@ -0,0 +1,121 @@ +# Authored Workflow Spike — demand gate for RFC #93 + +Reference spike for **agent-authored typed Workflows** (RFC #93): a model emits +a declarative, validated `WorkflowSpec` (typed data, **not** code) that the +framework validates and executes on the real ADK Workflow engine via the #92 +`DynamicNodeSupervisor`. 
This directory is the re-runnable demand-gate artifact +behind the RFC's "can a model author good plans?" question. + +## Environment + +- ADK: `2.1.0` +- Built against `google/adk-python` upstream `main`. +- Python 3.11+ (recursive `kind`-tagged unions; `asyncio.TaskGroup` in #92). + +## Files + +| File | Purpose | +| ---------------------------- | -------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- | +| `authoring.py` | `WorkflowSpec` (plain `kind`-tagged recursive tree), `CapabilityRegistry`, `WorkflowSpecValidator`, `SpecInterpreter` (step / fan_out / pipeline / branch / loop_until), and `FrozenWorkflowRecord` / `export_plan` / `import_plan` (portable plan envelope + defensive import). | +| `test_authoring.py` | Deterministic, CI-safe tests (no LLM). The trustworthy artifact. | +| `test_live_planner_sweep.py` | OPTIONAL env-gated live planner sweep across plan shapes. 
| + +## Deterministic tests (CI-safe, no network) + +```bash +pytest contributing/samples/workflows/authored_workflow_spike/test_authoring.py -q +``` + +Expected: **36 passed** — `Binding` invariant, `max_iters>=1`, validator accepts a +valid spec and rejects unknown capability / non-preceding binding / duplicate id, +the open-map warning, and interpreter execution of fan_out→aggregate, **pipeline (barrier-free per-item review→verify, plus per-stage `max_fan_out` enforcement)**, branch +(correct route), and loop_until (stops + correct output); plus **plan export/import** +(round-trip replays the same hash; import rejects a tampered spec, a dropped +capability, capability/registry version drift, an unsupported schema_version, +and a new input with no template schema); plus **ADK config lowering** of the +static subset (an illustrative projection toward static Workflow/agent config +shapes: sequence/loop/leaf by capability name; runtime fan_out/pipeline/branch +flagged no-equivalent rather than fabricated); plus **pattern coverage** +(adversarial verification and tournament via loop-carried `init`, incl. the +no-`init` validation error) and **plan-quality lints** (same-capability +self-review and unsynthesized fan-out warn; an independent plan lints clean; +`allow_self_chain` policy and recorded per-plan waivers suppress auditably); +plus **contract-hash drift** (import rejects a changed capability schema even +when the manual version string was never bumped, and **fails closed** when the +envelope's contract hashes are stripped entirely or partially). 
+ +## Pattern coverage — the six coordination shapes + +The six empirically common coordination patterns ([Dynamic Workflows: scaling +complex work](https://aipractitioner.substack.com/p/claude-dynamic-workflows-scaling)) +are all expressible in the v1 vocabulary, with deterministic tests: + +| Pattern | `WorkflowSpec` expression | Test | +| ------------------------ | ----------------------------------------------------- | ----------------------------------------------- | +| classify & route | `StepRef(classifier)` → `Branch` | `test_interpreter_branch_takes_correct_route` | +| fan-out / synthesize | `FanOut` → `StepRef(synthesizer)` | `test_interpreter_fanout_then_aggregate` | +| generate & filter | `FanOut(generate)` → `StepRef(filter)` | same shape as above | +| loop until done | `LoopUntil` + `until_capability` | `test_interpreter_loop_until_stops_and_outputs` | +| adversarial verification | `FanOut(skeptics)` → threshold/filter step | `test_pattern_adversarial_verification` | +| tournament | `LoopUntil(init=…, body=[pair_maker, FanOut(judge)])` | `test_pattern_tournament_loop_carried` | + +## Live planner sweep (optional evidence) + +Skipped unless configured — no hardcoded project/model: + +```bash +export SPIKE_LIVE=1 GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT=<your-project-id> GOOGLE_CLOUD_LOCATION=global +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # 3.5 serves from `global` +pytest contributing/samples/workflows/authored_workflow_spike/test_live_planner_sweep.py -q -s +``` + +## Gate results (run on `gemini-3.5-flash`) + +**Initial gate (codebase audit):** planner authored a valid, sensible, executable +plan (`fan_out reviewer → triager`) matching a hand-wired baseline. 
**PASS.** + +**Shape sweep (this directory):** the planner authored + validated + executed all +three shapes: + +| Shape | Authored steps | Result | +| ----------- | ----------------------- | --------------------------------------- | +| multi-stage | `fan_out → step → step` | report → formatted note | +| branch | `step → branch` | took the matched route, produced a note | +| loop_until | `loop_until` | iterated to a headline | + +## Findings that fell out (and shaped the RFC) + +1. **Open-ended `dict[str, X]` maps are a structured-output reliability hazard.** + Surfaced **twice**: a capability's `counts: dict[str,int]` came back empty, and + the spec's own `Branch.routes: dict[str, list]` came back empty. **Both fixed by + using enumerated/list structures** — capability outputs use fixed severity + fields; `Branch.routes` is now a `list[Route]`, not a map. The validator also + warns on open-map capability outputs. +1. **The strict `unmatched=fail` branch contract earns its keep** — when the planner + bound a branch switch to a whole object instead of its field, execution failed + loudly instead of silently mis-routing. +1. **Gemini `response_schema` rejects Pydantic's `Field(discriminator=...)`.** The + plan vocabulary is a PLAIN union of models that each carry a `kind` literal (a + *structurally-tagged* union). The strict discriminated form emits a + `discriminator` keyword that genai's `response_schema` refuses + (`Schema: extra_forbidden`, verified on `gemini-3.5-flash`); the `kind` tags + still make parsing and switching unambiguous. +1. **Planning vs capability quality are separable** — authoring/structure was + reliably good; the residual variance was per-capability output quality + (prompts/schemas/retries), not planning. +1. **The pattern-coverage sweep surfaced a real vocabulary gap.** The tournament + shape (pairs recomputed each round from the prior round's winners) needs + **loop-carried state**, which the binding-scope rules statically forbade. 
+ Fixed with `LoopUntil.init` + the rule that a body binding to the loop's own + id reads the carried value (validation error without `init`). This is why + gate tasks should be selected per coordination pattern, not ad hoc — single + tasks don't find these gaps. +1. **Typed bindings make agent independence statically checkable.** The + validator now lints same-capability self-review (self-preferential bias) + and unsynthesized fan-out; `independence_facts()` derives the positive + provenance statements (e.g. *"stage `verifier` sees ONLY stage `reviewer`'s + per-item output"*) the frozen record can prove to an auditor. Model-authored + orchestration *code* cannot be checked this way. + +This is a demand-gate artifact, not production code. diff --git a/contributing/samples/workflows/authored_workflow_spike/authoring.py b/contributing/samples/workflows/authored_workflow_spike/authoring.py new file mode 100644 index 00000000000..942d67e4381 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/authoring.py @@ -0,0 +1,999 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Agent-authored typed Workflows — reference spike (RFC #93). + +A minimal, faithful implementation of the RFC's authoring layer: + +* ``WorkflowSpec`` — a plain ``kind``-tagged recursive union (a typed plan + vocabulary; not Pydantic's discriminated union — see the SpecNode note). 
+* ``CapabilityRegistry`` — the closed set of agents/tools a plan may compose. +* ``WorkflowSpecValidator`` — semantic validation (capability refs, binding + scope, list/loop/branch rules) + an open-map output-schema warning. +* ``SpecInterpreter`` — executes a validated spec on the real ADK Workflow + engine via the #92 ``DynamicNodeSupervisor`` (step / fan_out / pipeline / + branch / loop_until). +* ``FrozenWorkflowRecord`` / ``export_plan`` / ``import_plan`` — the frozen spec + as a first-class, portable artifact (DESIGN.md §10): export to a JSON + envelope; import recomputes the hash and re-validates against the *current* + registry, never trusting the envelope's own ``validation``. + +This is a demand-gate artifact, not production code. See README.md. +""" + +from __future__ import annotations + +import hashlib +import json +import os +import sys +from typing import Any +from typing import Literal +from typing import Optional +from typing import Union + +from pydantic import BaseModel +from pydantic import Field +from pydantic import model_validator + +# The #92 supervisor lives in a sibling sample dir. 
# Make the sibling `dynamic_supervisor_spike` sample importable: its directory
# is pushed to the FRONT of sys.path so `supervisor` resolves to that module.
sys.path.insert(
    0,
    os.path.join(
        os.path.dirname(os.path.abspath(__file__)),
        "..",
        "dynamic_supervisor_spike",
    ),
)
from supervisor import DynamicNodeSupervisor  # noqa: E402


# ----------------------------------------------------------------- WorkflowSpec
class Binding(BaseModel):
  """The only way a node sources input: a source + optional dotted path."""

  # "task" reads the workflow's task input; "step" reads a named node's output.
  source: Literal["task", "step"]
  # Id of the node being read; must be set exactly when source == "step".
  step: Optional[str] = None
  # Optional dotted path ("a.b.c") walked into the sourced value.
  path: Optional[str] = None

  @model_validator(mode="after")
  def _invariant(self):
    # Reject both "step source without a step id" and "step id given for a
    # task source": the two conditions must agree.
    if (self.source == "step") != (self.step is not None):
      raise ValueError("source=='step' iff `step` is set")
    return self


class StepRef(BaseModel):
  # A single invocation of one registered capability on one bound input.
  kind: Literal["step"]
  id: str  # node id; later bindings refer to this node's output by it
  capability: str  # name looked up in the CapabilityRegistry
  input: Binding


class FanOut(BaseModel):
  # One capability applied to each element of the list `over` resolves to.
  kind: Literal["fan_out"]
  id: str
  over: Binding
  capability: str
  collect: Literal["list"] = "list"  # only list collection is expressible


class PipelineStage(BaseModel):
  capability: str  # registered; takes an item
  input: Binding | None = (
      None  # defaults to the previous stage's per-item output
  )


class Pipeline(BaseModel):
  # Barrier-free per-item multi-stage flow: each item runs through ALL stages
  # via #92 ctx.pipeline (item A can be in stage k while item B is in stage 1) —
  # NOT two barriered fan_outs. Compiles to DynamicNodeSupervisor.pipeline.
  kind: Literal["pipeline"]
  id: str
  over: Binding  # MUST resolve to a list
  stages: list[PipelineStage]
  collect: Literal["list"] = "list"


class Route(BaseModel):
  # One arm of a Branch: the nodes in `block` belong to the route whose
  # `value` is selected.
  value: str
  # Forward reference: SpecNode is defined after all node classes.
  block: list["SpecNode"]


class Branch(BaseModel):
  # Conditional node: `on` supplies the value that selects one of `routes`.
  kind: Literal["branch"]
  id: str
  on: Binding
  # Enumerated LIST of routes, NOT an open dict[str, ...] map: open maps are a
  # structured-output reliability hazard (the model leaves them empty). The
  # spike's branch shape exposed exactly this — see README.
  routes: list[Route]
  unmatched: Literal["fail"] = "fail"  # the only policy: no silent fallthrough


class LoopUntil(BaseModel):
  # Bounded loop: `body` repeats, with `until_capability` / `until_input`
  # describing the stop predicate and `max_iters` (>= 1) capping the rounds.
  kind: Literal["loop_until"]
  id: str
  body: list["SpecNode"]
  until_capability: str
  until_input: Binding
  max_iters: int = Field(ge=1)
  # Loop-carried state (optional): seeds the value a body step reads when it
  # binds the loop's OWN id; after each iteration the carried value becomes the
  # body's last-node output. Surfaced by the pattern-coverage sweep: a
  # tournament (pairs recomputed each round from the prior round's winners)
  # is inexpressible without it — every other accumulate-and-refine loop too.
  init: Optional[Binding] = None


# NOTE: a PLAIN union, not Pydantic's Field(discriminator="kind"). The discriminated
# form emits a JSON schema with a `discriminator` keyword that genai's response_schema
# rejects (Schema: extra_forbidden — verified on gemini-3.5-flash). Each member still
# carries a `kind` Literal, so this is a structurally-tagged union: unambiguous to parse
# and to switch on, AND accepted as a Gemini response_schema.
# Every node kind a plan may contain; each member carries its own `kind` tag.
SpecNode = Union[StepRef, FanOut, Pipeline, Branch, LoopUntil]


class WorkflowSpec(BaseModel):
  # Root of an authored plan. Documented with comments, not a docstring, so
  # the JSON schema this model emits stays exactly as it was (pydantic
  # surfaces a class docstring as the schema `description`).
  goal: str
  steps: list[SpecNode]  # top-level nodes, run in list order
  output: Binding  # resolved after `steps` to produce the plan's result


# Resolve the "SpecNode" forward references now that the union exists.
for _m in (
    StepRef,
    FanOut,
    PipelineStage,
    Pipeline,
    Branch,
    Route,
    LoopUntil,
    WorkflowSpec,
):
  _m.model_rebuild()


# ----------------------------------------------------------------- registry
class Capability(BaseModel):
  """A registered capability the planner may compose by name."""

  model_config = {"arbitrary_types_allowed": True}

  name: str
  build: Any  # () -> NodeLike (an ADK Agent, or a deterministic @node fn)
  input_kind: Literal["item", "list"]
  output_model: Optional[type[BaseModel]] = None
  serialize_input: bool = (
      True  # json.dumps the node_input (True for LLM agents)
  )
  max_fan_out: int = 100
  side_effect: bool = False
  version: str = "1"  # manual bump — a coarse SECONDARY drift signal only
  # Lint policy: same-capability chains (draft -> critique own draft ->
  # redraft) are legitimate refinement for some capabilities; opting in here
  # suppresses the self-review lint for this capability.
  allow_self_chain: bool = False

  def contract_hash(self) -> str:
    """Derived drift signal — sha256 over the capability's declared contract.

    Manual version strings don't get bumped when someone tweaks a schema; the
    contract hash changes automatically, so drift detection on import does
    not rely on developer discipline.

    Returns:
      Hex digest over `input_kind` and the output model's JSON schema (or
      `None` in place of the schema when no output model is declared).
    """
    schema = (
        None
        if self.output_model is None
        else self.output_model.model_json_schema()
    )
    return sha256_hex({"input_kind": self.input_kind, "output_schema": schema})


class CapabilityRegistry:
  """Name-indexed set of capabilities a plan is allowed to reference."""

  def __init__(self, capabilities: list[Capability], *, version: str = "1"):
    # Later entries win if two capabilities share a name.
    self._by_name = {c.name: c for c in capabilities}
    self.version = version  # registry_version (coarse drift signal)

  def __contains__(self, name):
    return name in self._by_name

  def __getitem__(self, name):
    return self._by_name[name]

  def names(self) -> list[str]:
    """Return the registered capability names in registration order."""
    return list(self._by_name)

  def capability_versions(
      self, only: Optional[set[str]] = None
  ) -> dict[str, str]:
    """name -> MANUAL version (coarse secondary drift signal)."""
    return {
        n: c.version
        for n, c in self._by_name.items()
        if only is None or n in only
    }

  def capability_contract_hashes(
      self, only: Optional[set[str]] = None
  ) -> dict[str, str]:
    """name -> DERIVED contract hash (the primary drift signal on import)."""
    return {
        n: c.contract_hash()
        for n, c in self._by_name.items()
        if only is None or n in only
    }

  def open_map_warnings(self) -> list[str]:
    """Spike lesson: open-ended dict[str, X] output fields are a structured-
    output reliability hazard (Gemini fills them unreliably). Warn on them.

    Returns:
      One warning per output-model field whose annotation contains a
      string-keyed dict, in registry / field order.
    """
    warnings: list[str] = []
    for cap in self._by_name.values():
      model = cap.output_model
      if model is None:
        continue
      for fname, field in model.model_fields.items():
        ann = str(field.annotation)
        # Compare on a space-free, lower-cased form so `dict[str, X]` and
        # `typing.Dict[str, X]` are both caught, wherever they sit in the
        # annotation (e.g. wrapped in Optional[...]).
        if "dict[str," in ann.replace(" ", "").lower():
          warnings.append(
              f"capability '{cap.name}': output field '{fname}' is an open"
              f" map ({ann}); prefer enumerated fields for reliable"
              " structured output"
          )
    return warnings


# ----------------------------------------------------------------- validator
class SpecValidationError(Exception):
  """Hard validation failure of a WorkflowSpec."""


class WorkflowSpecValidator:
  """Semantic checks of a WorkflowSpec against one CapabilityRegistry."""

  def __init__(self, registry: CapabilityRegistry):
    self.registry = registry

  def validate(
      self,
      spec: WorkflowSpec,
      *,
      lint_waivers: Optional[dict[str, str]] = None,
  ) -> list[str]:
    """Raises SpecValidationError on a hard error; returns soft warnings.

    `lint_waivers` (node id -> justification) suppresses plan-quality lints
    for the named nodes; record waivers in the FrozenWorkflowRecord so the
    suppression itself is auditable.

    Returns:
      The registry's open-map warnings followed by the plan-quality lints.

    Raises:
      SpecValidationError: on any structural error found by `_walk`, or when
        `spec.output` names a step id that does not exist in the spec.
    """
    ids: set[str] = set()
    self._walk(spec.steps, set(), ids)
    # `ids` holds every id in the spec, nested ones included.
    if spec.output.source == "step" and spec.output.step not in ids:
      raise SpecValidationError(
          f"output references unknown step {spec.output.step!r}"
      )
    return self.registry.open_map_warnings() + self.quality_lints(
        spec, lint_waivers=lint_waivers
    )

  def quality_lints(
      self,
      spec: WorkflowSpec,
      *,
      lint_waivers: Optional[dict[str, str]] = None,
  ) -> list[str]:
    """Plan-quality lints (soft warnings, never hard errors).

    Multi-agent quality rests on isolation: it is what mitigates
    self-preferential bias (an agent grading its own output) and goal drift.
    Typed bindings make two such properties STATICALLY checkable — something
    model-authored orchestration *code* cannot offer:

    * self-review: a node consuming output produced by the SAME capability
      cannot provide independent verification;
    * unsynthesized fan-out: a plan whose terminal output is a bare per-item
      fan_out never combined or verified by a downstream capability.

    Suppression (so the lints stay credible instead of globally disabled):
    a capability registered with `allow_self_chain=True` opts out of the
    self-review lint (legitimate draft -> critique -> redraft refinement);
    `lint_waivers` suppresses lints for specific node ids per plan.
    """
    waivers = lint_waivers or {}
    lints: list[str] = []
    producer_cap: dict[str, str] = {}  # node id -> capability producing output
    consumed: set[str] = set()  # step ids some other node reads from

    # Pass 1: record who produces what and which ids are read, and lint
    # adjacent same-capability pipeline stages.
    def walk(nodes):
      for n in nodes:
        if isinstance(n, (StepRef, FanOut)):
          producer_cap[n.id] = n.capability
        elif isinstance(n, Pipeline):
          # A pipeline's output is produced by its last stage.
          producer_cap[n.id] = n.stages[-1].capability if n.stages else ""
        # _bindings() already includes every pipeline stage's explicit input,
        # so no separate pass over stage inputs is needed for `consumed`.
        for b in _bindings(n):
          if b.source == "step":
            consumed.add(b.step)
        if isinstance(n, Pipeline):
          for prev, st in zip(n.stages, n.stages[1:]):
            if (
                st.capability == prev.capability
                and st.input is None
                and n.id not in waivers
                and not (
                    st.capability in self.registry
                    and self.registry[st.capability].allow_self_chain
                )
            ):
              lints.append(
                  f"plan-quality: pipeline {n.id!r} stage"
                  f" {st.capability!r} re-checks its own capability's output —"
                  " same-capability review cannot provide independent"
                  " verification (self-preferential bias)"
              )
        if isinstance(n, Branch):
          for route in n.routes:
            walk(route.block)
        if isinstance(n, LoopUntil):
          walk(n.body)

    walk(spec.steps)

    # Pass 2: needs the complete producer map, hence a separate walk. Flags a
    # node whose input/over reads a node produced by the same capability.
    def walk_consumers(nodes):
      for n in nodes:
        my_cap = getattr(n, "capability", None)
        b = getattr(n, "input", None) or getattr(n, "over", None)
        if (
            my_cap
            and isinstance(b, Binding)
            and b.source == "step"
            and producer_cap.get(b.step) == my_cap
            and n.id not in waivers
            and not (
                my_cap in self.registry
                and self.registry[my_cap].allow_self_chain
            )
        ):
          lints.append(
              f"plan-quality: {n.id!r} consumes the output of {b.step!r} via"
              f" the same capability {my_cap!r} — same-capability review"
              " cannot provide independent verification (self-preferential"
              " bias)"
          )
        for route in getattr(n, "routes", None) or []:
          walk_consumers(route.block)
        if getattr(n, "body", None):
          walk_consumers(n.body)

    walk_consumers(spec.steps)

    if spec.output.source == "step":
      terminal = spec.output.step

      # Depth-first lookup of the node the plan output binds to.
      def find(nodes):
        for n in nodes:
          if n.id == terminal:
            return n
          for route in getattr(n, "routes", None) or []:
            hit = find(route.block)
            if hit is not None:
              return hit
          if getattr(n, "body", None):
            hit = find(n.body)
            if hit is not None:
              return hit
        return None

      node_ = find(spec.steps)
      if (
          isinstance(node_, FanOut)
          and terminal not in consumed
          and terminal not in waivers
      ):
        lints.append(
            f"plan-quality: output binds directly to fan_out {terminal!r}"
            " with no downstream synthesis or verification step — parallel"
            " findings are never combined or independently checked"
        )
    return lints

  def _walk(self, nodes, preceding: set[str], ids: set[str]) -> set[str]:
    """Validate `nodes` in order and return the ids in scope afterwards.

    Args:
      nodes: the block to validate.
      preceding: ids a binding in this block may read on entry; copied, so
        the caller's set is not modified.
      ids: every id seen so far across the WHOLE spec; mutated in place and
        used to reject duplicates.

    Returns:
      `preceding` plus the id of each node in this block. Ids nested inside
      branch routes or loop bodies are not added to the enclosing scope.

    Raises:
      SpecValidationError: on the first rule violation encountered.
    """
    preceding = set(preceding)
    for n in nodes:
      if n.id in ids:
        raise SpecValidationError(f"duplicate id {n.id!r}")
      ids.add(n.id)
      if isinstance(n, (StepRef, FanOut)) and n.capability not in self.registry:
        raise SpecValidationError(f"unknown capability {n.capability!r}")
      if isinstance(n, LoopUntil) and n.until_capability not in self.registry:
        raise SpecValidationError(
            f"unknown until_capability {n.until_capability!r}"
        )
      # Entry bindings (input/over/on/init) reference a PRIOR step on this path.
      for f in ("input", "over", "on", "init"):
        b = getattr(n, f, None)
        if (
            isinstance(b, Binding)
            and b.source == "step"
            and b.step not in preceding
        ):
          raise SpecValidationError(
              f"{n.id}: binding references non-preceding step {b.step!r}"
          )
      if (
          isinstance(n, FanOut)
          and self.registry[n.capability].input_kind != "item"
      ):
        raise SpecValidationError(
            f"fan_out {n.id}: capability must take an item"
        )
      if isinstance(n, Pipeline):
        if not n.stages:
          raise SpecValidationError(f"pipeline {n.id}: needs >= 1 stage")
        for st in n.stages:
          if st.capability not in self.registry:
            raise SpecValidationError(f"unknown capability {st.capability!r}")
          if self.registry[st.capability].input_kind != "item":
            raise SpecValidationError(
                f"pipeline {n.id}: stage {st.capability!r} must take an item"
            )
          if (
              isinstance(st.input, Binding)
              and st.input.source == "step"
              and st.input.step not in preceding
          ):
            raise SpecValidationError(
                f"pipeline {n.id}: stage input references non-preceding step"
                f" {st.input.step!r}"
            )
      if isinstance(n, LoopUntil):
        # A body step may bind the loop's OWN id to read the loop-carried
        # value — but only if `init` seeds it (else iteration 0 has nothing).
        if n.init is None and _references_step(n.body, n.id):
          raise SpecValidationError(
              f"loop {n.id}: body reads the loop-carried value (binds the"
              " loop's own id) but no `init` binding seeds it"
          )
        # body executes in-scope; until_input may reference a body step.
        body_scope = self._walk(n.body, preceding | {n.id}, ids)
        ui = n.until_input
        if ui.source == "step" and ui.step not in body_scope:
          raise SpecValidationError(
              f"loop {n.id}: until_input references step {ui.step!r} not in its"
              " body/scope"
          )
      if isinstance(n, Branch):
        # NOTE(review): the branch's own id is in scope for its routes, so a
        # route step binding to it passes validation — confirm the interpreter
        # has a value for it at that point.
        for route in n.routes:
          self._walk(route.block, preceding | {n.id}, ids)
      preceding.add(n.id)
    return preceding


def _bindings(n) -> list[Binding]:
  """Return every Binding a node declares, pipeline stage inputs included."""
  out = []
  for f in ("input", "over", "on", "until_input", "init"):
    b = getattr(n, f, None)
    if isinstance(b, Binding):
      out.append(b)
  for st in getattr(n, "stages", None) or []:
    if isinstance(st.input, Binding):
      out.append(st.input)
  return out


def _references_step(nodes, step_id: str) -> bool:
  """True if any binding in `nodes` (recursively) reads `step_id`."""
  for n in nodes:
    if any(b.source == "step" and b.step == step_id for b in _bindings(n)):
      return True
    for route in getattr(n, "routes", None) or []:
      if _references_step(route.block, step_id):
        return True
    if getattr(n, "body", None) and _references_step(n.body, step_id):
      return True
  return False


# ----------------------------------------------------------- export / import
#
# DESIGN.md §10: the frozen spec is a first-class, exportable artifact. The
# source of truth is the typed WorkflowSpec; the compiled Workflow is derived
# and never stored. A single canonical hash definition keeps two exporters in
# agreement, and import NEVER trusts the envelope's own `validation` — it
# recomputes the hash and re-validates against the *current* registry.


def canonical_json(value) -> str:
  """The one fixed serialization for hashing (no whitespace/key-order drift)."""
  return json.dumps(value, sort_keys=True, separators=(",", ":"))


def sha256_hex(value) -> str:
  """Return the hex sha256 of `value`'s canonical JSON serialization."""
  return hashlib.sha256(canonical_json(value).encode()).hexdigest()


def referenced_capabilities(spec: WorkflowSpec) -> set[str]:
  """Every capability name a spec composes (walks pipeline stages, branch
  routes, and loop bodies — not just top-level steps)."""
  found: set[str] = set()

  def walk(nodes):
    for n in nodes:
      cap = getattr(n, "capability", None)
      if cap:
        found.add(cap)
      for st in getattr(n, "stages", None) or []:
        found.add(st.capability)
      for route in getattr(n, "routes", None) or []:
        walk(route.block)
      # A loop's stop predicate is a capability too.
      if getattr(n, "until_capability", None):
        found.add(n.until_capability)
      if getattr(n, "body", None):
        walk(n.body)

  walk(spec.steps)
  return found


def independence_facts(spec: WorkflowSpec) -> list[str]:
  """Human-readable provenance facts derivable STATICALLY from the bindings.

  Each fact states what a step can possibly see — its only input is a typed
  binding, so isolation (no shared context, no inherited reasoning) is a
  checkable property of the frozen plan, not a runtime hope. This is what
  makes structural bias controls auditable: the record proves a verifier saw
  only the producer's output and that synthesis traces back to the task input.
  """
  facts: list[str] = []

  def walk(nodes):
    for n in nodes:
      if isinstance(n, Pipeline):
        # Only adjacent stages with DIFFERENT capabilities and a defaulted
        # input count as independent verification.
        for prev, st in zip(n.stages, n.stages[1:]):
          if st.input is None and st.capability != prev.capability:
            facts.append(
                f"pipeline {n.id!r}: stage {st.capability!r} sees ONLY stage"
                f" {prev.capability!r}'s per-item output — independent"
                " verification, per item"
            )
      b = getattr(n, "input", None) or getattr(n, "over", None)
      if isinstance(b, Binding):
        src = (
            "the task input"
            if b.source == "task"
            else f"the typed output of {b.step!r}"
        )
        facts.append(f"{n.id!r} consumes ONLY {src}")
      for route in getattr(n, "routes", None) or []:
        walk(route.block)
      if getattr(n, "body", None):
        walk(n.body)

  walk(spec.steps)
  return facts


class ValidationResult(BaseModel):
  # Outcome of validating a spec: pass/fail plus any soft warnings.
  passed: bool
  warnings: list[str] = Field(default_factory=list)


class FrozenWorkflowRecord(BaseModel):
  """The single shape behind session state, the audit event, and the export
  envelope (DESIGN.md §5) — v1 storage is never a weaker subset."""

  schema_version: str = "v1"
  spec: WorkflowSpec
  spec_hash: str
  planner_model: str
  registry_version: str
  capability_versions: dict[str, str]
  # DERIVED sha256 over each referenced capability's declared contract — the
  # primary drift signal (manual versions above are secondary).
  capability_contract_hashes: dict[str, str] = Field(default_factory=dict)
  # Per-plan lint waivers (node id -> justification), recorded so a
  # suppressed lint is an AUDITABLE decision, not a silenced one.
  lint_waivers: dict[str, str] = Field(default_factory=dict)
  validation: ValidationResult
  created_at: str  # ISO-8601, stamped at freeze (caller supplies; not now())
  task_input_schema: Optional[dict] = None
  task_input_digest: Optional[str] = None

  @classmethod
  def freeze(
      cls,
      spec: WorkflowSpec,
      *,
      planner_model: str,
      registry: CapabilityRegistry,
      created_at: str,
      task_input=None,
      task_input_schema: Optional[dict] = None,
      lint_waivers: Optional[dict[str, str]] = None,
  ) -> "FrozenWorkflowRecord":
    """Validate + capture everything needed for replay and drift detection.

    Versions and contract hashes are recorded only for the capabilities the
    spec actually references. `task_input`, when given, is stored as a digest.

    Raises:
      SpecValidationError: if `spec` fails validation against `registry`.
    """
    warnings = WorkflowSpecValidator(registry).validate(
        spec, lint_waivers=lint_waivers
    )  # raises on hard error
    refs = referenced_capabilities(spec)
    return cls(
        spec=spec,
        spec_hash=sha256_hex(spec.model_dump(mode="json")),
        planner_model=planner_model,
        registry_version=registry.version,
        capability_versions=registry.capability_versions(only=refs),
        capability_contract_hashes=registry.capability_contract_hashes(
            only=refs
        ),
        lint_waivers=dict(lint_waivers or {}),
        validation=ValidationResult(passed=True, warnings=warnings),
        created_at=created_at,
        task_input_schema=task_input_schema,
        task_input_digest=(
            None if task_input is None else sha256_hex(task_input)
        ),
    )


class PlanImportError(Exception):
  """Raised when an exported plan fails integrity, drift, or input checks."""


SUPPORTED_SCHEMA_VERSION = "v1"


def export_plan(record: FrozenWorkflowRecord) -> dict:
  """Serialize the §5 record to a portable JSON-able envelope."""
  return record.model_dump(mode="json")


def import_plan(
    envelope: dict, registry: CapabilityRegistry, *, task_input=None
) -> WorkflowSpec:
  """Re-hydrate an exported plan, NEVER trusting the envelope's own checks.

  Integrity + drift (DESIGN.md §10):
    0. reject an unsupported schema_version;
    1. recompute sha256(canonical_json(spec)); REJECT if != envelope spec_hash;
    2. re-run WorkflowSpecValidator against the CURRENT registry (catches a
       dropped/renamed capability);
    3. registry-version and per-capability version drift -> fail loudly.
  Execution-input contract:
    * replay (no schema): task_input digest MUST match the envelope's;
    * template (schema): task_input is validated against task_input_schema;
    * neither: do NOT execute against arbitrary new input.

  Returns:
    The re-validated WorkflowSpec parsed from `envelope["spec"]`.

  Raises:
    PlanImportError: on any failed integrity, drift, or input check. A
      re-validation failure carries the SpecValidationError as its cause.
  """
  # 0. schema_version — a defensive importer refuses formats it doesn't know.
  schema_version = envelope.get("schema_version")
  if schema_version != SUPPORTED_SCHEMA_VERSION:
    raise PlanImportError(
        f"unsupported schema_version {schema_version!r} (this importer supports"
        f" {SUPPORTED_SCHEMA_VERSION!r})"
    )

  spec = WorkflowSpec.model_validate(envelope["spec"])

  # 1. integrity — recompute, don't trust.
  recomputed = sha256_hex(spec.model_dump(mode="json"))
  if recomputed != envelope.get("spec_hash"):
    raise PlanImportError(
        "spec_hash mismatch: envelope has"
        f" {envelope.get('spec_hash')!r}, recomputed {recomputed!r} — the spec"
        " was tampered with or re-serialized under a different definition"
    )

  # 2. re-validate against the CURRENT registry (dropped capability fails here).
  try:
    WorkflowSpecValidator(registry).validate(spec)
  except SpecValidationError as e:
    # Chain explicitly so the original validation error is kept as the cause.
    raise PlanImportError(
        f"re-validation against current registry failed: {e}"
    ) from e

  # 3a. registry-version drift is a hard error (DESIGN.md §10).
  if envelope.get("registry_version") != registry.version:
    raise PlanImportError(
        "registry_version drift (recorded"
        f" {envelope.get('registry_version')!r} vs current"
        f" {registry.version!r}) — re-validate / migrate before reuse"
    )

  # Both per-capability checks below look at the same referenced set.
  refs = referenced_capabilities(spec)

  # 3b. per-capability MANUAL version drift (coarse secondary signal).
  current = registry.capability_versions(only=refs)
  recorded = envelope.get("capability_versions", {})
  drifted = {
      n: (recorded.get(n), current.get(n))
      for n in current
      if recorded.get(n) != current.get(n)
  }
  if drifted:
    raise PlanImportError(
        f"capability version drift (recorded vs current): {drifted} — promote"
        " to a template with explicit migration before reuse"
    )

  # 3c. per-capability CONTRACT drift (primary, derived signal): catches a
  # changed input_kind / output schema even when nobody bumped a version.
  # FAIL CLOSED: a v1 envelope MUST record a contract hash for every
  # referenced capability — otherwise stripping the field (or one entry)
  # from the envelope would silently bypass drift detection.
  current_ch = registry.capability_contract_hashes(only=refs)
  recorded_ch = envelope.get("capability_contract_hashes") or {}
  missing_ch = sorted(n for n in current_ch if n not in recorded_ch)
  if missing_ch:
    raise PlanImportError(
        f"envelope is missing contract hashes for {missing_ch} — a v1"
        " envelope must record a contract hash for every referenced"
        " capability (fail closed: a stripped field must not bypass drift"
        " detection)"
    )
  contract_drift = {
      n: (recorded_ch[n], current_ch[n])
      for n in current_ch
      if recorded_ch[n] != current_ch[n]
  }
  if contract_drift:
    raise PlanImportError(
        "capability contract drift (recorded vs current schema hash):"
        f" {contract_drift} — the capability's declared contract changed"
        " since export; re-validate / migrate before reuse"
    )

  # Execution-input contract.
  if task_input is not None:
    schema = envelope.get("task_input_schema")
    if schema is not None:
      # Template: only the schema's `required` keys are checked here.
      missing = [k for k in schema.get("required", []) if k not in task_input]
      if missing:
        raise PlanImportError(
            f"task input missing required keys {missing} for this template"
        )
    else:
      # Replay: the input must hash to the digest recorded at freeze time.
      digest = sha256_hex(task_input)
      if digest != envelope.get("task_input_digest"):
        raise PlanImportError(
            "task_input digest mismatch and no task_input_schema captured:"
            " this plan can only be REPLAYED on its original input (promote to"
            " a template to reuse it on new input)"
        )

  return spec


# ------------------------------------------------- AgentConfig lowering (§11)
# A STRUCTURAL PROJECTION of a WorkflowSpec's static skeleton onto ADK
# `AgentConfig` shapes — the convergence direction from DESIGN §11, shown
# concretely. It is deliberately NOT a loadable `root_agent.yaml`:
#   * the static subset projects to SequentialAgent / LoopAgent / LlmAgent shapes;
#   * leaf agents are referenced by ALLOW-LISTED capability name, never by an
#     importable FQN (the trust-boundary point — a model never names an import);
#   * the dynamic blocks (fan_out over a runtime list, pipeline, branch) have NO
#     `AgentConfig` equivalent and are emitted as explicit `unsupported` markers,
#     never fabricated as config.
# A full loadable-config compiler (child YAML / an allow-listed capability-ref
# field) is future work (DESIGN §12).

# `agent_class` value marking a block with no AgentConfig equivalent.
# NOTE(review): this is an empty string here, while the AgentConfig lowering
# note speaks of explicit `unsupported` markers — confirm the literal is intact.
AGENTCONFIG_UNSUPPORTED = ""


def _lower_block(node) -> dict:
  """Project one spec node onto an AgentConfig-shaped dict.

  StepRef and LoopUntil map to `LlmAgent` / `LoopAgent` shapes (loop bodies
  are lowered recursively); FanOut, Pipeline and Branch are tagged with
  AGENTCONFIG_UNSUPPORTED plus their `workflowspec_kind`.

  Raises:
    TypeError: if `node` is none of the five known node classes.
  """
  if isinstance(node, StepRef):
    return {
        "agent_class": "LlmAgent",
        "name": node.id,
        "capability": node.capability,
    }
  if isinstance(node, LoopUntil):
    return {
        "agent_class": "LoopAgent",
        "name": node.id,
        "max_iterations": node.max_iters,
        "sub_agents": [_lower_block(b) for b in node.body],
        "_note": (
            f"until-predicate ({node.until_capability}) has no AgentConfig"
            " field; enforced by SpecInterpreter"
        ),
    }
  if isinstance(node, FanOut):
    return {
        "agent_class": AGENTCONFIG_UNSUPPORTED,
        "workflowspec_kind": "fan_out",
        "name": node.id,
        "capability": node.capability,
    }
  if isinstance(node, Pipeline):
    return {
        "agent_class": AGENTCONFIG_UNSUPPORTED,
        "workflowspec_kind": "pipeline",
        "name": node.id,
        "stages": [st.capability for st in node.stages],
    }
  if isinstance(node, Branch):
    # Route blocks are not lowered; only the branch itself is flagged.
    return {
        "agent_class": AGENTCONFIG_UNSUPPORTED,
        "workflowspec_kind": "branch",
        "name": node.id,
    }
  raise TypeError(f"unknown block: {type(node).__name__}")


def lower_to_agent_config(
    spec: WorkflowSpec, *, name: str = "authored_workflow"
) -> dict:
  """Project the static skeleton of `spec` onto an ADK `AgentConfig` shape.

  Illustrative (see the module note above), not a loadable `root_agent.yaml`:
  the ordered `steps` sequence projects to a `SequentialAgent`; leaf steps to
  `LlmAgent` (by capability name, not FQN); dynamic blocks are flagged
  `unsupported`, never fabricated.
  """
  return {
      "agent_class": "SequentialAgent",
      "name": name,
      "sub_agents": [_lower_block(s) for s in spec.steps],
  }


def agent_config_coverage(spec: WorkflowSpec) -> dict:
  """A quick 'X of N top-level blocks lower to config' number for the demo."""
  lowered = lower_to_agent_config(spec)["sub_agents"]
  # Kinds of the top-level blocks that were flagged as unsupported.
  dynamic = [
      b["workflowspec_kind"]
      for b in lowered
      if b["agent_class"] == AGENTCONFIG_UNSUPPORTED
  ]
  return {
      "total": len(lowered),
      "lowerable": len(lowered) - len(dynamic),
      "dynamic": dynamic,
  }


# ----------------------------------------------------------------- interpreter
class SpecInterpreter:
  """Executes a validated WorkflowSpec on the real ADK engine via the #92
  supervisor. Handles step / fan_out / pipeline / branch / loop_until."""

  def __init__(self, registry: CapabilityRegistry, ctx, *, gate: int = 8):
    self.registry = registry
    self.ctx = ctx
    # `gate` is passed straight through to the supervisor.
    self.sup = DynamicNodeSupervisor(ctx, gate=gate)
    self.state: dict[str, Any] = {}  # node id -> that node's output
    self.dispatch_count = 0  # capability dispatches — cheap cost visibility

  def _resolve(self, binding: Binding, task_input):
    """Return the value `binding` points at.

    The base is `task_input` for a task binding, otherwise the stored output
    of `binding.step`. A dotted `path` is then walked one segment at a time:
    dict lookup for dicts, attribute access for anything else.
    """
    base = task_input if binding.source == "task" else self.state[binding.step]
    if binding.path:
      cur = base
      for part in binding.path.split("."):
        cur = cur[part] if isinstance(cur, dict) else getattr(cur, part)
      return cur
    return base

  def _arg(self, cap: Capability, value):
    """JSON-encode `value` when the capability asks for serialized input."""
    # default=str: anything json cannot encode is stringified.
    return json.dumps(value, default=str) if cap.serialize_input else value

  def _dispatch_cap(self, cap: Capability, value, run_id: str):
    """Count one dispatch and hand a freshly built node to the supervisor.

    Returns whatever `sup.dispatch` returns, un-awaited; `_dispatch` awaits it.
    """
    self.dispatch_count += 1
    return self.sup.dispatch(
        cap.build(), node_input=self._arg(cap, value), run_id=run_id
    )

  async def _dispatch(self, cap_name: str, value, run_id: str):
    """Look up `cap_name` in the registry, dispatch it, and await the result."""
    return await self._dispatch_cap(self.registry[cap_name], value, run_id)

  async def execute(self, spec: WorkflowSpec, task_input) -> Any:
    """Run the spec's top-level steps, then resolve and return its output."""
    await self._run_block(spec.steps, task_input, prefix="")
    return self._resolve(spec.output, task_input)

async def _run_block(self, nodes, task_input, prefix: str): + last = None + for n in nodes: + rid = f"{prefix}{n.id}" + if isinstance(n, StepRef): + self.state[n.id] = await self._dispatch( + n.capability, self._resolve(n.input, task_input), rid + ) + elif isinstance(n, FanOut): + cap = self.registry[n.capability] + items = self._resolve(n.over, task_input) + if len(items) > cap.max_fan_out: + raise SpecValidationError( + f"runtime: fan_out {len(items)} exceeds max_fan_out" + f" {cap.max_fan_out}" + ) + self.state[n.id] = await self.sup.pipeline( + items, + ( + lambda _p, it, i, c=cap, rid=rid: self._dispatch_cap( + c, it, f"{rid}_{i}" + ) + ), + ) + elif isinstance(n, Pipeline): + # Barrier-free per-item multi-stage flow via #92 ctx.pipeline — each item + # threads ALL stages; item A can be in stage k while item B is in stage 1 + # (NOT two barriered fan_outs). stage[0] input defaults to the per-item + # element; stage[k] input defaults to stage[k-1]'s per-item output. + items = self._resolve(n.over, task_input) + # Each stage dispatches once per item, so every stage capability is + # subject to the same data-dependent fan-out cap as a FanOut. 
+ for st in n.stages: + cap = self.registry[st.capability] + if len(items) > cap.max_fan_out: + raise SpecValidationError( + f"runtime: pipeline stage {st.capability!r} fan_out" + f" {len(items)} exceeds max_fan_out {cap.max_fan_out}" + ) + stage_fns = [] + for si, st in enumerate(n.stages): + + def stage(prev, it, i, si=si, st=st, rid=rid): + cap = self.registry[st.capability] + value = ( + self._resolve(st.input, task_input) + if st.input is not None + else (it if si == 0 else prev) + ) + return self._dispatch_cap(cap, value, f"{rid}_{i}_{si}") + + stage_fns.append(stage) + self.state[n.id] = await self.sup.pipeline(items, *stage_fns) + elif isinstance(n, Branch): + value = str(self._resolve(n.on, task_input)) + routes = {r.value: r.block for r in n.routes} + if value not in routes: + raise SpecValidationError( + f"runtime: branch {n.id} unmatched value {value!r}" + " (unmatched=fail)" + ) + out = await self._run_block( + routes[value], task_input, prefix=f"{rid}_{value}_" + ) + self.state[n.id] = out + elif isinstance(n, LoopUntil): + # Loop-carried state: `init` seeds state[loop.id]; after every + # iteration the carried value becomes the body's last-node output, so + # a body step binding the loop's own id reads the PRIOR round's result + # (tournament: pairs recomputed each round from the prior winners). 
+ if n.init is not None: + self.state[n.id] = self._resolve(n.init, task_input) + out = None + for i in range(n.max_iters): + out = await self._run_block(n.body, task_input, prefix=f"{rid}_i{i}_") + self.state[n.id] = out + verdict = await self._dispatch( + n.until_capability, + self._resolve(n.until_input, task_input), + f"{rid}_i{i}_until", + ) + if _truthy(verdict): + break + self.state[n.id] = out + last = self.state.get(n.id) + return last + + +def _truthy(v) -> bool: + if isinstance(v, bool): + return v + if isinstance(v, dict): + for k in ("result", "value", "done", "ok"): + if k in v: + return bool(v[k]) + return bool(v) diff --git a/contributing/samples/workflows/authored_workflow_spike/test_authoring.py b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py new file mode 100644 index 00000000000..ee59d490596 --- /dev/null +++ b/contributing/samples/workflows/authored_workflow_spike/test_authoring.py @@ -0,0 +1,902 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Deterministic, CI-safe tests for the authored-workflow spike (RFC #93). + +No LLM. Capabilities are deterministic stub nodes, so these exercise the +validator + the interpreter (step / fan_out / pipeline / branch / loop_until + binding +scope) on the real ADK Workflow engine. The live planner sweep lives in +test_live_planner_sweep.py (env-gated). 
# Copyright 2026 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

"""Deterministic, CI-safe tests for the authored-workflow spike (RFC #93).

No LLM. Capabilities are deterministic stub nodes, so these exercise the
validator + the interpreter (step / fan_out / pipeline / branch / loop_until +
binding scope) on the real ADK Workflow engine. The live planner sweep lives in
test_live_planner_sweep.py (env-gated).
"""

from __future__ import annotations

import json
import os
import sys

from google.adk import Event
from google.adk import Workflow
from google.adk.runners import Runner
from google.adk.sessions.in_memory_session_service import InMemorySessionService
from google.adk.workflow import node
from google.genai import types
from pydantic import BaseModel
import pytest

sys.path.insert(0, os.path.dirname(os.path.abspath(__file__)))
from authoring import agent_config_coverage
from authoring import AGENTCONFIG_UNSUPPORTED
from authoring import Binding  # noqa: E402
from authoring import Branch
from authoring import Capability
from authoring import CapabilityRegistry
from authoring import export_plan
from authoring import FanOut
from authoring import FrozenWorkflowRecord
from authoring import import_plan
from authoring import LoopUntil
from authoring import lower_to_agent_config
from authoring import Pipeline
from authoring import PipelineStage
from authoring import PlanImportError
from authoring import Route
from authoring import sha256_hex
from authoring import SpecInterpreter
from authoring import SpecValidationError
from authoring import StepRef
from authoring import WorkflowSpec
from authoring import WorkflowSpecValidator


# ----------------------------------------------------------------- spec sugar
def _task(path=None):
  """Binding to the task input, optionally narrowed by a dotted path."""
  if path is None:
    return Binding(source="task")
  return Binding(source="task", path=path)


def _from(step):
  """Binding to the stored output of a preceding node."""
  return Binding(source="step", step=step)


def _step(node_id, capability, source):
  """A single-capability step reading its input from `source`."""
  return StepRef(
      kind="step", id=node_id, capability=capability, input=source
  )


# ----------------------------------------------------------------- stub caps
def _cap_node(name, fn):
  """Return a builder for a deterministic node that emits `fn(node_input)`."""

  def build():
    @node(name=name)
    async def n(ctx, node_input):
      yield Event(output=fn(node_input))

    return n

  return build


def _stub(name, fn, input_kind="item", **extra):
  """A non-serializing stub capability backed by `_cap_node`."""
  return Capability(
      name=name,
      build=_cap_node(name, fn),
      input_kind=input_kind,
      serialize_input=False,
      **extra,
  )


def _registry():
  """Registry of the stub capabilities shared by most tests."""

  def review(f):
    return {
        "path": f["path"],
        "severity": "HIGH" if "bad" in f["code"] else "NONE",
    }

  def count(findings):
    return {
        "n": len(findings),
        "high": sum(1 for x in findings if x["severity"] == "HIGH"),
    }

  return CapabilityRegistry([
      _stub("review", review, max_fan_out=10),
      _stub("count", count, input_kind="list"),
      _stub("classify", lambda s: "tech" if "code" in str(s) else "other"),
      _stub("tech_summary", lambda s: "TECH:" + str(s)),
      _stub("other_summary", lambda s: "OTHER:" + str(s)),
      _stub("draft", lambda s: {"text": "v", "len": len(str(s))}),
      _stub("is_good", lambda s: True),
  ])


async def _run_spec(spec, registry, task_input):
  """Execute `spec` inside a one-node Workflow and return its output."""
  holder = {}

  @node(rerun_on_resume=True)
  async def parent(ctx, node_input):
    interp = SpecInterpreter(registry, ctx)
    holder["out"] = await interp.execute(spec, task_input)
    yield Event(output={"_done": True})

  wf = Workflow(name="t", edges=[("START", parent)])
  sessions = InMemorySessionService()
  runner = Runner(app_name=wf.name, node=wf, session_service=sessions)
  session = await sessions.create_session(app_name=wf.name, user_id="u")
  kickoff = types.Content(parts=[types.Part(text="go")], role="user")
  async for _ in runner.run_async(
      user_id="u", session_id=session.id, new_message=kickoff
  ):
    pass
  return holder["out"]


# ----------------------------------------------------------------- validator
def test_binding_invariant():
  """`step` is required for source=step and forbidden for source=task."""
  with pytest.raises(Exception):
    Binding(source="step")  # step missing
  with pytest.raises(Exception):
    Binding(source="task", step="x")  # step set for task


def test_loop_max_iters_must_be_positive():
  with pytest.raises(Exception):
    LoopUntil(
        kind="loop_until",
        id="l",
        body=[],
        until_capability="is_good",
        until_input=_task(),
        max_iters=0,
    )


def _fanout_aggregate_spec():
  """review fanned out over task.files, then count over the findings."""
  fan = FanOut(
      kind="fan_out", id="rev", over=_task("files"), capability="review"
  )
  return WorkflowSpec(
      goal="audit",
      steps=[fan, _step("agg", "count", _from("rev"))],
      output=_from("agg"),
  )


def test_validator_accepts_valid_spec():
  # no raise
  WorkflowSpecValidator(_registry()).validate(_fanout_aggregate_spec())


def test_validator_rejects_unknown_capability():
  spec = _fanout_aggregate_spec()
  spec.steps[0].capability = "nope"
  with pytest.raises(SpecValidationError):
    WorkflowSpecValidator(_registry()).validate(spec)


def test_validator_rejects_nonpreceding_binding():
  # the only step references a later/unknown step
  spec = WorkflowSpec(
      goal="x",
      steps=[_step("a", "count", _from("later"))],
      output=_from("a"),
  )
  with pytest.raises(SpecValidationError):
    WorkflowSpecValidator(_registry()).validate(spec)


def test_validator_rejects_duplicate_id():
  spec = WorkflowSpec(
      goal="x",
      steps=[
          _step("a", "classify", _task()),
          _step("a", "classify", _task()),
      ],
      output=_from("a"),
  )
  with pytest.raises(SpecValidationError):
    WorkflowSpecValidator(_registry()).validate(spec)


def test_open_map_warning():
  class BadReport(BaseModel):
    total: int
    counts: dict[str, int]  # open map — should warn

  triage = Capability(
      name="triage",
      build=lambda: None,
      input_kind="list",
      output_model=BadReport,
  )
  warnings = CapabilityRegistry([triage]).open_map_warnings()
  assert any("open map" in w for w in warnings)


# ----------------------------------------------------------------- interpreter
@pytest.mark.asyncio
async def test_interpreter_fanout_then_aggregate():
  files = [
      {"path": "a.py", "code": "bad thing"},
      {"path": "b.py", "code": "fine"},
      {"path": "c.py", "code": "bad"},
  ]
  out = await _run_spec(_fanout_aggregate_spec(), _registry(), {"files": files})
  assert out == {"n": 3, "high": 2}


@pytest.mark.asyncio
async def test_interpreter_branch_takes_correct_route():
  branch = Branch(
      kind="branch",
      id="br",
      on=_from("cls"),
      routes=[
          Route(value="tech", block=[_step("t", "tech_summary", _task())]),
          Route(value="other", block=[_step("o", "other_summary", _task())]),
      ],
  )
  spec = WorkflowSpec(
      goal="branch",
      steps=[_step("cls", "classify", _task()), branch],
      output=_from("br"),
  )
  WorkflowSpecValidator(_registry()).validate(spec)
  tech_out = await _run_spec(spec, _registry(), "this is code")
  assert tech_out.startswith("TECH:")
  other_out = await _run_spec(spec, _registry(), "hello world")
  assert other_out.startswith("OTHER:")


@pytest.mark.asyncio
async def test_interpreter_loop_until_stops_and_outputs():
  loop = LoopUntil(
      kind="loop_until",
      id="lp",
      body=[_step("d", "draft", _task())],
      until_capability="is_good",
      until_input=_from("d"),
      max_iters=3,
  )
  spec = WorkflowSpec(goal="loop", steps=[loop], output=_from("lp"))
  WorkflowSpecValidator(_registry()).validate(spec)
  out = await _run_spec(spec, _registry(), "topic")
  # loop output = last body node output
  assert out == {"text": "v", "len": len("topic")}


# ----------------------------------------------------------------- pipeline
def _timed_registry(log):
  """reviewer (stage 0) + verifier (stage 1) as deterministic timed stubs."""
  import asyncio
  import time

  def stage_cap(name, slow_for=None, key="r"):
    def build():
      @node(name=name)
      async def n(ctx, node_input):
        item = node_input
        log.append((name, "start", time.perf_counter()))
        await asyncio.sleep(
            0.05 if (slow_for is not None and item == slow_for) else 0.0
        )
        log.append((name, "end", time.perf_counter()))
        yield Event(output={key: item})

      return n

    return Capability(
        name=name, build=build, input_kind="item", serialize_input=False
    )

  return CapabilityRegistry([
      stage_cap("reviewer", slow_for=1, key="review"),
      stage_cap("verifier", key="verdict"),
  ])


def _pipeline_spec():
  """reviewer -> verifier threaded per item of task.items."""
  pipe = Pipeline(
      kind="pipeline",
      id="pp",
      over=_task("items"),
      stages=[
          PipelineStage(capability="reviewer"),
          PipelineStage(capability="verifier"),
      ],
  )
  return WorkflowSpec(goal="pipe", steps=[pipe], output=_from("pp"))


def test_validator_accepts_pipeline():
  log = []
  WorkflowSpecValidator(_timed_registry(log)).validate(_pipeline_spec())


def test_validator_rejects_pipeline_list_stage():
  spec = _pipeline_spec()
  # "count" takes a list, not an item -> invalid pipeline stage
  spec.steps[0].stages[1] = PipelineStage(capability="count")
  with pytest.raises(SpecValidationError):
    WorkflowSpecValidator(_registry()).validate(spec)


@pytest.mark.asyncio
async def test_interpreter_pipeline_ordered_and_barrier_free():
  log = []
  reg = _timed_registry(log)
  # input items [0, 1]; reviewer is slow for item 1 only.
  out = await _run_spec(_pipeline_spec(), reg, {"items": [0, 1]})

  # Ordered, per-item review->verify (verdict carries the reviewed value):
  assert out == [{"verdict": {"review": 0}}, {"verdict": {"review": 1}}]

  started = {name for (name, phase, _t) in log if phase == "start"}
  ended = {name for (name, phase, _t) in log if phase == "end"}
  # BARRIER-FREE proof: item 0 reaches stage 2 (verifier) BEFORE item 1 finishes
  # stage 1 (reviewer). Two barriered fan_outs could NOT do this — every
  # reviewer would finish before any verifier started.
  assert "verifier" in started and "reviewer" in ended
  # earliest verifier start precedes the latest reviewer end:
  first_verifier_start = min(
      t for (name, phase, t) in log if name == "verifier" and phase == "start"
  )
  last_reviewer_end = max(
      t for (name, phase, t) in log if name == "reviewer" and phase == "end"
  )
  assert first_verifier_start < last_reviewer_end


@pytest.mark.asyncio
async def test_interpreter_pipeline_enforces_max_fan_out():
  # Each stage dispatches once per item, so a stage capability's max_fan_out is
  # a data-dependent cap that must be enforced at runtime (same as FanOut).
  log = []
  reg = _timed_registry(log)
  reg["verifier"].max_fan_out = 1  # 2 items > cap -> reject before dispatch
  with pytest.raises(SpecValidationError):
    await _run_spec(_pipeline_spec(), reg, {"items": [0, 1]})
  # rejected pre-dispatch: no stage ran.
  assert log == []


# ----------------------------------------------------------------- export/import
_TASK = {"files": [{"path": "a.py", "code": "bad"}]}


def _frozen():
  """Frozen record of the fan-out/aggregate spec on the `_TASK` input."""
  return FrozenWorkflowRecord.freeze(
      _fanout_aggregate_spec(),
      planner_model="gemini-3.5-flash",
      registry=_registry(),
      created_at="2026-06-02T00:00:00Z",
      task_input=_TASK,
  )


def test_export_then_import_roundtrip_replays_same_hash():
  env = export_plan(_frozen())
  # the envelope is JSON-serializable and carries the full §5 record.
  roundtripped = json.loads(json.dumps(env))
  assert roundtripped["schema_version"] == "v1"
  assert set(env["capability_versions"]) == {"review", "count"}
  # re-import on the ORIGINAL input (replay path) succeeds and recomputes the
  # SAME hash from the spec — integrity holds.
  spec = import_plan(env, _registry(), task_input=_TASK)
  assert sha256_hex(spec.model_dump(mode="json")) == env["spec_hash"]


def test_import_rejects_tampered_spec():
  env = export_plan(_frozen())
  # tamper with the spec but leave the recorded hash -> integrity check fires.
  env["spec"]["goal"] = "exfiltrate"
  with pytest.raises(PlanImportError, match="spec_hash mismatch"):
    import_plan(env, _registry(), task_input=_TASK)


def test_import_rejects_dropped_capability():
  env = export_plan(_frozen())
  # current registry no longer has `count` -> re-validation against the CURRENT
  # registry fails (we never trust the envelope's own `validation`).
  shrunk = CapabilityRegistry([_registry()["review"]])
  with pytest.raises(PlanImportError, match="re-validation"):
    import_plan(env, shrunk, task_input=_TASK)


def test_import_rejects_capability_version_drift():
  env = export_plan(_frozen())
  # same capabilities, but `review` was bumped since export -> drift.
  bumped = _registry()
  bumped["review"].version = "2"
  with pytest.raises(PlanImportError, match="version drift"):
    import_plan(env, bumped, task_input=_TASK)


def test_import_rejects_unsupported_schema_version():
  env = export_plan(_frozen())
  env["schema_version"] = "v2"  # an importer must refuse formats it can't read
  with pytest.raises(PlanImportError, match="schema_version"):
    import_plan(env, _registry(), task_input=_TASK)


def test_import_rejects_registry_version_drift():
  env = export_plan(_frozen())
  # same capabilities/versions, but the whole registry was re-versioned ->
  # hard error per DESIGN.md §10.
  same_caps = list(_registry()._by_name.values())
  v2_registry = CapabilityRegistry(same_caps, version="2")
  with pytest.raises(PlanImportError, match="registry_version"):
    import_plan(env, v2_registry, task_input=_TASK)


def test_lower_static_sequence_to_sequential_agent():
  spec = WorkflowSpec(
      goal="x",
      steps=[
          _step("c", "classify", _task()),
          _step("s", "tech_summary", _from("c")),
      ],
      output=_from("s"),
  )
  cfg = lower_to_agent_config(spec)
  assert cfg["agent_class"] == "SequentialAgent"
  agent_classes = [s["agent_class"] for s in cfg["sub_agents"]]
  assert agent_classes == ["LlmAgent", "LlmAgent"]
  assert [s["capability"] for s in cfg["sub_agents"]] == [
      "classify",
      "tech_summary",
  ]
  assert AGENTCONFIG_UNSUPPORTED not in agent_classes


def test_lower_loop_to_loop_agent():
  loop_node = LoopUntil(
      kind="loop_until",
      id="lp",
      body=[_step("d", "draft", _task())],
      until_capability="is_good",
      until_input=_from("d"),
      max_iters=3,
  )
  spec = WorkflowSpec(goal="x", steps=[loop_node], output=_from("lp"))
  loop = lower_to_agent_config(spec)["sub_agents"][0]
  assert loop["agent_class"] == "LoopAgent"
  assert loop["max_iterations"] == 3
  assert loop["sub_agents"][0]["capability"] == "draft"


def test_lower_marks_dynamic_blocks_unsupported():
  # pipeline is per-item over a runtime list -> no AgentConfig equivalent.
  cov = agent_config_coverage(_pipeline_spec())
  assert cov == {"total": 1, "lowerable": 0, "dynamic": ["pipeline"]}


def test_lower_never_emits_importable_fqn():
  # leaves are referenced by allow-listed capability name, never by an
  # importable path; the FQN-bearing keys ADK config would use are absent.
  spec = WorkflowSpec(
      goal="x",
      steps=[_step("c", "classify", _task())],
      output=_from("c"),
  )
  blob = json.dumps(lower_to_agent_config(spec))
  assert '"code"' not in blob and '"config_path"' not in blob
  assert '"capability": "classify"' in blob


def test_import_rejects_new_input_without_template_schema():
  env = export_plan(_frozen())  # no task_input_schema captured -> replay-only
  other = {"files": [{"path": "z.py", "code": "ok"}]}
  with pytest.raises(PlanImportError, match="digest mismatch"):
    import_plan(env, _registry(), task_input=other)
  # but template promotion (a captured schema) lets a new input through:
  env["task_input_schema"] = {"required": ["files"]}
  assert import_plan(env, _registry(), task_input=other) is not None


# ------------------------------------------------------------ pattern coverage
# The six empirically common coordination patterns (classify-route, fan-out/
# synthesize, generate-filter, loop-until-done, adversarial verification,
# tournament) must all be expressible in the v1 vocabulary. Four are already
# exercised above (branch test = classify-route; fanout_then_aggregate =
# fan-out/synthesize AND generate-filter; loop test = loop-until-done). The two
# non-obvious shapes get explicit tests here. Tournament is the one that
# surfaced a vocabulary gap: data-dependent pairing needs LOOP-CARRIED state
# (`LoopUntil.init` + body bindings to the loop's own id).
def _pattern_registry():
  """Stub capabilities for the tournament and adversarial-verification tests."""

  def capability(name, fn, input_kind):
    return Capability(
        name=name,
        build=_cap_node(name, fn),
        input_kind=input_kind,
        serialize_input=False,
    )

  def make_pairs(lst):
    return [lst[i : i + 2] for i in range(0, len(lst), 2)]

  def doubt(f):
    return {"claim": f["claim"], "refuted": not f["evidence"]}

  def unrefuted_claims(vs):
    return [v["claim"] for v in vs if not v["refuted"]]

  return CapabilityRegistry([
      capability("pair_maker", make_pairs, "list"),
      capability("judge", lambda pair: min(pair), "item"),
      capability("single_winner", lambda lst: len(lst) == 1, "list"),
      capability("skeptic", doubt, "item"),
      capability("keep_unrefuted", unrefuted_claims, "list"),
  ])


def _tournament_spec():
  """Single-elimination loop that carries each round's winners forward."""
  pairs = StepRef(
      kind="step",
      id="pairs",
      capability="pair_maker",
      # reads the LOOP-CARRIED value: the candidates on round 0,
      # the prior round's winners afterwards.
      input=Binding(source="step", step="tourney"),
  )
  round_winners = FanOut(
      kind="fan_out",
      id="round_winners",
      over=Binding(source="step", step="pairs"),
      capability="judge",
  )
  tourney = LoopUntil(
      kind="loop_until",
      id="tourney",
      init=Binding(source="task", path="candidates"),
      body=[pairs, round_winners],
      until_capability="single_winner",
      until_input=Binding(source="step", step="round_winners"),
      max_iters=4,
  )
  return WorkflowSpec(
      goal="single elimination",
      steps=[tourney],
      output=Binding(source="step", step="tourney"),
  )


@pytest.mark.asyncio
async def test_pattern_tournament_loop_carried():
  reg = _pattern_registry()
  assert WorkflowSpecValidator(reg).validate(_tournament_spec()) == []
  candidates = ["delta", "bravo", "charlie", "alpha"]
  out = await _run_spec(_tournament_spec(), reg, {"candidates": candidates})
  # round 1: (delta,bravo)->bravo, (charlie,alpha)->alpha; round 2: -> alpha.
  assert out == ["alpha"]


def test_validator_rejects_loop_carried_read_without_init():
  spec = _tournament_spec()
  spec.steps[0].init = None  # body still binds the loop's own id
  with pytest.raises(SpecValidationError, match="init"):
    WorkflowSpecValidator(_pattern_registry()).validate(spec)


@pytest.mark.asyncio
async def test_pattern_adversarial_verification():
  # Independent skeptics per finding (fan_out) + a threshold/filter step:
  # only evidence-backed claims survive. No new vocabulary needed.
  verdicts = FanOut(
      kind="fan_out",
      id="verdicts",
      over=Binding(source="task", path="findings"),
      capability="skeptic",
  )
  confirmed = StepRef(
      kind="step",
      id="confirmed",
      capability="keep_unrefuted",
      input=Binding(source="step", step="verdicts"),
  )
  spec = WorkflowSpec(
      goal="verify findings adversarially",
      steps=[verdicts, confirmed],
      output=Binding(source="step", step="confirmed"),
  )
  reg = _pattern_registry()
  assert WorkflowSpecValidator(reg).validate(spec) == []
  findings = [
      {"claim": "A", "evidence": True},
      {"claim": "B", "evidence": False},
      {"claim": "C", "evidence": True},
  ]
  out = await _run_spec(spec, reg, {"findings": findings})
  assert out == ["A", "C"]


# ------------------------------------------------------------ quality lints
def _self_review_spec():
  """classify feeding classify: the shape the self-review lint targets."""
  first = StepRef(
      kind="step",
      id="a",
      capability="classify",
      input=Binding(source="task"),
  )
  second = StepRef(
      kind="step",
      id="b",
      capability="classify",
      input=Binding(source="step", step="a"),
  )
  return WorkflowSpec(
      goal="x",
      steps=[first, second],
      output=Binding(source="step", step="b"),
  )


def _plan_quality(warnings):
  """Only the warnings emitted by the plan-quality lints."""
  return [w for w in warnings if w.startswith("plan-quality")]


def test_lint_warns_on_same_capability_review():
  # classify reviewing classify's own output cannot be independent.
  warnings = WorkflowSpecValidator(_registry()).validate(_self_review_spec())
  assert any("same capability 'classify'" in w for w in warnings)


def test_lint_self_chain_policy_suppresses():
  # draft -> critique-own-draft -> redraft is legitimate refinement; a
  # capability can opt out of the self-review lint via allow_self_chain.
  reg = _registry()
  reg["classify"].allow_self_chain = True
  warnings = WorkflowSpecValidator(reg).validate(_self_review_spec())
  assert _plan_quality(warnings) == []


def test_lint_waiver_suppresses_and_is_recorded():
  # A per-plan waiver (node id -> justification) suppresses the lint AND is
  # recorded in the frozen record — auditable suppression, not silence.
  waivers = {"b": "intentional self-refinement pass"}
  warnings = WorkflowSpecValidator(_registry()).validate(
      _self_review_spec(), lint_waivers=waivers
  )
  assert _plan_quality(warnings) == []
  rec = FrozenWorkflowRecord.freeze(
      _self_review_spec(),
      planner_model="gemini-3.5-flash",
      registry=_registry(),
      created_at="2026-06-09T00:00:00Z",
      lint_waivers=waivers,
  )
  assert export_plan(rec)["lint_waivers"] == waivers


def test_import_rejects_missing_contract_hashes():
  # Review finding (High): stripping capability_contract_hashes from the
  # envelope must NOT bypass drift detection. Exact reproduction: export,
  # delete the field, change a capability's output schema without bumping
  # the manual version — import must fail closed on the missing hashes.
  env = export_plan(_frozen())
  del env["capability_contract_hashes"]

  class NewCountReport(BaseModel):
    n: int

  changed = _registry()
  changed["count"].output_model = NewCountReport  # version string unchanged
  with pytest.raises(PlanImportError, match="missing contract hashes"):
    import_plan(env, changed, task_input=_TASK)
  # fail closed even with NO drift at all — the field itself is required:
  env2 = export_plan(_frozen())
  del env2["capability_contract_hashes"]
  with pytest.raises(PlanImportError, match="missing contract hashes"):
    import_plan(env2, _registry(), task_input=_TASK)


def test_import_rejects_partial_contract_hashes():
  # Dropping a SINGLE capability's hash must fail closed too.
  env = export_plan(_frozen())
  del env["capability_contract_hashes"]["count"]
  with pytest.raises(
      PlanImportError, match=r"missing contract hashes for \['count'\]"
  ):
    import_plan(env, _registry(), task_input=_TASK)


def test_import_rejects_contract_hash_drift():
  # The DERIVED drift signal: change a capability's declared contract (here,
  # its output schema) WITHOUT bumping the manual version — manual-version
  # drift stays silent; the contract hash catches it.
  env = export_plan(_frozen())

  class NewCountReport(BaseModel):
    n: int  # narrower contract than before

  changed = _registry()
  changed["count"].output_model = NewCountReport  # version string unchanged
  with pytest.raises(PlanImportError, match="contract drift"):
    import_plan(env, changed, task_input=_TASK)


def test_lint_warns_on_unsynthesized_fanout():
  lone_fan_out = FanOut(
      kind="fan_out",
      id="rev",
      over=Binding(source="task", path="files"),
      capability="review",
  )
  spec = WorkflowSpec(
      goal="x",
      steps=[lone_fan_out],
      output=Binding(source="step", step="rev"),
  )
  warnings = WorkflowSpecValidator(_registry()).validate(spec)
  assert any("no downstream synthesis" in w for w in warnings)


def test_lints_clean_on_independent_plan():
  # review -> count: different capabilities, fan_out is synthesized. Clean.
  warnings = WorkflowSpecValidator(_registry()).validate(
      _fanout_aggregate_spec()
  )
  assert warnings == []
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""OPTIONAL live planner sweep for RFC #93 — coverage across plan shapes. + +Skipped unless a real model is configured (no hardcoded project/model). Asks a +planner LlmAgent(output_schema=WorkflowSpec) to author plans for three shapes — +multi-stage, branch, loop_until — then validates and executes each on the real +ADK engine. Demonstrates authoring quality beyond the single fan-out/aggregate +shape from the original gate. + +Enable (Vertex): + export SPIKE_LIVE=1 GOOGLE_GENAI_USE_VERTEXAI=1 + export GOOGLE_CLOUD_PROJECT= GOOGLE_CLOUD_LOCATION=global + export SPIKE_GEMINI_MODEL=gemini-3.5-flash # 3.5 serves from `global` + pytest test_live_planner_sweep.py -q -s +""" + +from __future__ import annotations + +import os +import sys +from typing import Literal + +from google.adk import Agent +from google.adk import Event +from google.adk import Workflow +from google.adk.runners import Runner +from google.adk.sessions.in_memory_session_service import InMemorySessionService +from google.adk.workflow import node +from google.genai import types +from pydantic import BaseModel +import pytest + +sys.path.insert(0, os.path.dirname(os.path.abspath(__file__))) +from authoring import Capability # noqa: E402 +from authoring import CapabilityRegistry +from authoring import SpecInterpreter +from authoring import WorkflowSpec +from authoring import WorkflowSpecValidator + +_LIVE = os.environ.get("SPIKE_LIVE") == "1" and bool( + os.environ.get("GOOGLE_CLOUD_PROJECT") +) +pytestmark = pytest.mark.skipif( + not _LIVE, reason="set SPIKE_LIVE=1 + project/model env to run" +) 
def _registry():
  """Build the capability registry shared by every plan shape in the sweep.

  Each capability wraps a lazily-built ``Agent`` with a typed output schema.
  All of them take a single item except ``triager``, which is declared with
  ``input_kind="list"`` because it aggregates the reviewer's findings.

  Returns:
    A ``CapabilityRegistry`` holding the eight capabilities the three shapes
    (multi-stage, branch, loop) may reference.
  """
  caps = [
      # ``_agent`` already declares input_kind="item", so the reviewer needs no
      # post-construction fix-up.
      _agent(
          "reviewer",
          Finding,
          "Input JSON with keys path and code. Output a Finding (echo path).",
      ),
      # The only list-input capability: spelled out because ``_agent`` always
      # builds item-input capabilities.
      Capability(
          name="triager",
          input_kind="list",
          output_model=ReportFixed,
          serialize_input=True,
          build=lambda: Agent(
              name="triager",
              model=MODEL,
              output_schema=ReportFixed,
              generate_content_config=DET,
              instruction=(
                  "Input: JSON list of Findings. Output ReportFixed: total, "
                  "per-severity counts (sum to total), one-line summary."
              ),
          ),
      ),
      _agent(
          "formatter",
          Note,
          "Input: a ReportFixed JSON. Output a Note: a one-line markdown"
          " bullet.",
      ),
      _agent(
          "writer",
          Note,
          "Input: a topic (maybe with feedback). Output a Note: a short tech"
          " headline.",
      ),
      _agent(
          "is_tech",
          Verdict,
          "Input: a headline/Note JSON. Output Verdict.is_tech=true iff it is"
          " about technology/software.",
      ),
      _agent(
          "classifier",
          Category,
          "Input: a short text. Output Category 'tech' or 'other'.",
      ),
      _agent(
          "tech_note",
          Note,
          "Input: text. Output a Note summarizing it as a tech item.",
      ),
      _agent(
          "other_note",
          Note,
          "Input: text. Output a Note summarizing it as a general item.",
      ),
  ]
  return CapabilityRegistry(caps)
+ " output=the loop." + ), + "task_input": {"topic": "quantum computing"}, + }, +} + + +async def _author_validate_execute(shape, cfg): + reg = _registry() + planner = Agent( + name="planner", + model=MODEL, + output_schema=WorkflowSpec, + generate_content_config=DET, + instruction=( + "Author a WorkflowSpec using ONLY these capabilities: " + + cfg["registry_desc"] + + " Use Binding(source='task', path=...) for task input and" + " Binding(source='step', step=) to chain. " + + cfg["task"] + ), + ) + holder = {} + + @node(rerun_on_resume=True) + async def parent(ctx, node_input): + raw = await ctx.run_node( + planner, node_input=f"Shape: {shape}. Author the plan.", run_id="plan" + ) + spec = WorkflowSpec.model_validate(raw) + holder["spec"] = spec + WorkflowSpecValidator(reg).validate(spec) # raises on invalid + holder["valid"] = True + interp = SpecInterpreter(reg, ctx) + holder["output"] = await interp.execute(spec, cfg["task_input"]) + yield Event(output={"_done": True}) + + wf = Workflow(name=shape, edges=[("START", parent)]) + ss = InMemorySessionService() + r = Runner(app_name=wf.name, node=wf, session_service=ss) + s = await ss.create_session(app_name=wf.name, user_id="u") + async for _ in r.run_async( + user_id="u", + session_id=s.id, + new_message=types.Content(parts=[types.Part(text="go")], role="user"), + ): + pass + return holder + + +def _all_capabilities(nodes) -> set: + """Capabilities referenced anywhere in the plan tree (incl. 
branch/loop).""" + out = set() + for n in nodes: + if n.kind in ("step", "fan_out"): + out.add(n.capability) + elif n.kind == "loop_until": + out.add(n.until_capability) + out |= _all_capabilities(n.body) + elif n.kind == "branch": + for route in n.routes: + out |= _all_capabilities(route.block) + return out + + +@pytest.mark.parametrize("shape", list(SHAPES)) +@pytest.mark.asyncio +async def test_planner_sweep(shape): + h = await _author_validate_execute(shape, SHAPES[shape]) + spec = h["spec"] + top_kinds = [s.kind for s in spec.steps] + caps = _all_capabilities(spec.steps) + print( + f"\n[{shape}] top_kinds={top_kinds} caps={sorted(caps)} " + f"valid={h.get('valid')} output={str(h.get('output'))[:100]}" + ) + assert h.get("valid") is True + assert h.get("output") is not None + + # Shape-specific structure — a degenerate plan must NOT pass. + if shape == "multi_stage": + assert top_kinds == ["fan_out", "step", "step"] + assert {"reviewer", "triager", "formatter"} <= caps + elif shape == "branch": + assert "branch" in top_kinds and "step" in top_kinds + branch = next(n for n in spec.steps if n.kind == "branch") + route_values = {r.value for r in branch.routes} + route_caps = set() + for r in branch.routes: + route_caps |= _all_capabilities(r.block) + assert route_values == { + "tech", + "other", + }, route_values # exact routes, not just >=2 + assert { + "tech_note", + "other_note", + } <= route_caps # both routes wired correctly + assert "classifier" in caps + elif shape == "loop": + loop = next(n for n in spec.steps if n.kind == "loop_until") + assert len(loop.body) >= 1 + assert "writer" in caps and "is_tech" in caps diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/README.md b/contributing/samples/workflows/dynamic_supervisor_spike/README.md new file mode 100644 index 00000000000..f89917ec51c --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/README.md @@ -0,0 +1,103 @@ +# Dynamic Supervisor Spike — concurrent 
dynamic dispatch for ADK Workflows + +Reproducible harness for an RFC proposing leaf-gated concurrent dynamic +dispatch (`ctx.pipeline` / `ctx.parallel`) on the ADK Workflow engine. + +**The harness exists to prove the design on the real engine, not to ship an +API.** It pins exactly which properties hold: all five merge-gate properties +hold with a wrapper supervisor on the unmodified engine. The v1 interrupt +behavior is decided — cancel in-flight siblings and re-run them on resume; +checkpoint-then-pause is a deferred v2 product decision. + +## Environment + +- Built/run against `google/adk-python` (branch rebased onto current `main`). +- *Historical run evidence below was captured on ADK `2.0.0` at `origin/main` @ `4006fe40`; results re-verified on the rebased branch.* +- Python: 3.11+ (uses `asyncio.TaskGroup` + `except*`) + +## Files + +| File | Purpose | +| ---------------------------------- | ------------------------------------------------------------------------------------------------------ | +| `supervisor.py` | Prototype `DynamicNodeSupervisor` (gate-on-leaf + `TaskGroup` fan-out) over the real `ctx.run_node()`. | +| `test_dynamic_supervisor_spike.py` | Deterministic regression harness (no LLM). The trustworthy artifact. | +| `test_live_gemini_e2e.py` | OPTIONAL live-model evidence; env-gated, skipped by default. | + +## Run the deterministic harness (CI-safe, no network) + +```bash +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_dynamic_supervisor_spike.py -q +``` + +### Expected current result: **11 passed** + +1. `test_concurrent_dispatch_correct_and_barrier_free` — concurrent `ctx.run_node` + executes correctly (distinct results, no corruption), wall ≈ max-delay not serial sum. +1. `test_pipeline_barrier_free` — item 0 enters stage 2 before item 1 finishes stage 1. +1. `test_parallel_failed_item_isolation` — ordinary error → `None`, siblings unaffected. +1. 
`test_control_exception_propagates_and_cancels_siblings` — `NodeInterruptedError` + propagates **and cancels the running sibling**. Requires `asyncio.TaskGroup`: + `asyncio.gather` propagates but does **not** cancel siblings, so the supervisor + contract mandates TaskGroup-equivalent structured concurrency. +1. `test_nested_combinator_no_deadlock_leaf_gating` — a pipeline stage calling + `parallel` with `gate=2` completes; peak in-flight ≤ gate. +1. `test_driver_gating_deadlocks_as_predicted` — CONTRAST: gating *drivers* instead + of *leaves* deadlocks (timeout). Proves the leaf-gating decision empirically. +1. `test_sequential_resume_is_exactly_once` — sequential dispatch resumes + exactly-once (completed children fast-forward; interrupted node re-runs). +1. `test_concurrent_resume_completed_children_fast_forward` — **the merge gate.** + Under *concurrent* dispatch, children that COMPLETE before the interrupt + fast-forward on resume (exactly-once). No double-spend. +1. `test_concurrent_inflight_children_cancelled_on_interrupt_rerun` — pins the + **decided v1 semantic**: a sibling that interrupts while others are still IN + FLIGHT cancels them; cancelled (never-completed) children correctly re-run on + resume. (Checkpoint-then-pause is deferred to v2.) +1. `test_child_cancellederror_does_not_cancel_siblings` — a branch-originated + `asyncio.CancelledError` is asyncio task-cancellation: not propagated, siblings + untouched, slot left `None`. Only `NodeInterruptedError` / non-cancellation + `BaseException` cancel siblings. +1. `test_gate_must_be_positive` — `gate=0`/negative raises `ValueError` at + construction (would otherwise deadlock every dispatch). + +## Resume exactly-once: there is no engine gap (a correction) + +An earlier draft of this harness reported a resume "engine gap." 
That was a +**test artifact and has been retracted.** The earlier test let the +`RequestInput` child interrupt *before* its siblings finished, so the +`TaskGroup` cancelled still-running siblings; those **cancelled (never +completed)** children then re-ran on resume — which is *correct*, not a bug. + +With the timing separated (test 8 vs test 9), the truth is: + +- **Completed** concurrent children **fast-forward** on resume (exactly-once) — + identical to sequential. No double-spend of completed LLM work. +- **In-flight** children cancelled by an interrupting sibling **re-run** on + resume — correctness-preserving (they never completed). + +The `"Workflow ...: cancelling N leftover tasks"` log is **benign cleanup** — it +appears even in the sequential exactly-once run, and completion is still +checkpointed correctly. It is not corruption. + +**Net: all five merge-gate properties hold with a wrapper supervisor + the real +engine; no `_workflow.py` change is required for resume correctness.** The one +behavior worth calling out in the RFC is a design trade-off, not a bug: +interrupting one branch cancels in-flight siblings and discards their partial +progress. If preserving that progress is desired, that is a separate design +decision (e.g. checkpoint-then-pause instead of cancel). + +## Optional: live model evidence (supporting only) + +Skipped unless explicitly configured — never runs in CI by accident: + +```bash +export SPIKE_LIVE=1 +export GOOGLE_GENAI_USE_VERTEXAI=1 +export GOOGLE_CLOUD_PROJECT= +export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves here +export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access +pytest contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py -q -s +``` + +Asserts the concurrent pipeline wall-clock is well under the serial sum of +per-call latencies. The deterministic engine tests — not this — are the +artifact maintainers should trust. 
diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py new file mode 100644 index 00000000000..0c899b01202 --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/supervisor.py @@ -0,0 +1,189 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""Prototype DynamicNodeSupervisor for concurrent dynamic dispatch. + +This is the RFC spike artifact (see README.md). It layers concurrent +``ctx.run_node()`` orchestration on the real ADK Workflow engine via a +framework-owned supervisor. Two design decisions are encoded here and +verified by the tests: + +1. The concurrency gate is acquired around each LEAF dispatch (a single + ``ctx.run_node`` call), NOT around drivers. Orchestration frames + (drivers, fan-out, nested pipeline/parallel) hold no permit while + awaiting children, so nesting a combinator inside a stage cannot + deadlock. (Gating drivers DOES deadlock — see the contrast test.) + +2. Fan-out uses ``asyncio.TaskGroup`` (structured concurrency), NOT + ``asyncio.gather``. ``gather`` propagates an exception but does not + cancel siblings. + +Failure / cancellation contract (verified by the tests): + +* Ordinary ``Exception`` in a branch -> that branch becomes ``None``; + siblings are unaffected. 
class DynamicNodeSupervisor:
  """Drives concurrent dynamic ``ctx.run_node()`` chains under one parent.

  The supervisor owns a single ``asyncio.Semaphore`` (``self.gate``) that is
  acquired around each leaf ``dispatch`` call only. The ``parallel`` and
  ``pipeline`` combinators fan work out through ``asyncio.TaskGroup`` and hold
  no permit themselves, unless ``pipeline(..., gate_drivers=True)`` is used,
  which deliberately gates the per-item drivers.

  ``peak_in_flight`` records the highest number of dispatches that were inside
  the gate at the same time.
  """

  def __init__(self, ctx, *, gate: int | None = None) -> None:
    """Create a supervisor bound to one parent node context.

    Args:
      ctx: Object providing ``run_node`` (and optionally ``isolation_scope``);
        only those two attributes are used by this class.
      gate: Maximum number of concurrently running leaf dispatches. ``None``
        selects ``default_gate()``.

    Raises:
      ValueError: If the resolved gate is smaller than 1.
    """
    self.ctx = ctx
    resolved_gate = gate if gate is not None else default_gate()
    if resolved_gate < 1:
      raise ValueError(f"gate must be >= 1, got {resolved_gate}")
    self.gate = asyncio.Semaphore(resolved_gate)
    # High-water mark of simultaneously running dispatches.
    self.peak_in_flight = 0
    # Number of dispatches currently holding a gate permit.
    self._in_flight = 0

  async def dispatch(
      self, child, *, node_input: Any = None, run_id: str | None = None
  ) -> Any:
    """One leaf dispatch. The gate is held ONLY for the child execution.

    Each dispatch runs in its OWN sub-branch AND its own isolation scope
    (parent_scope::run_id). Parallel siblings share an author name, and a
    single_turn LLM child making MULTIPLE model calls (tool loops) rebuilds
    its context per call by scanning for the latest input event in its
    isolation scope — with the parent's shared scope, sibling inputs landing
    in between get picked up instead (observed: fanned-out tool-using
    skeptics all answering the LAST sibling's claim). The wrapper stamps the
    child's input event and its FC/FR trail with ctx.isolation_scope, so a
    per-dispatch scope makes context independence structural for multi-call
    children rather than an artifact of single-call timing.

    Args:
      child: The node handed to ``ctx.run_node``.
      node_input: Input forwarded to the child unchanged.
      run_id: Forwarded to ``ctx.run_node``. When truthy it is also appended
        to the parent's isolation scope; otherwise the parent scope is reused.

    Returns:
      Whatever ``ctx.run_node`` returns for the child.
    """
    async with self.gate:
      self._in_flight += 1
      self.peak_in_flight = max(self.peak_in_flight, self._in_flight)
      # ``getattr`` with a default: the context may not expose a scope at all.
      parent_scope = getattr(self.ctx, "isolation_scope", None)
      scope = f"{parent_scope}::{run_id}" if run_id else parent_scope
      try:
        return await self.ctx.run_node(
            child,
            node_input=node_input,
            run_id=run_id,
            use_sub_branch=True,
            override_isolation_scope=scope,
        )
      finally:
        # Runs on success, failure and cancellation, so the counter stays
        # balanced.
        self._in_flight -= 1

  async def _guard_ordinary(self, factory: Callable[[], Awaitable[Any]]) -> Any:
    """Ordinary Exception -> None (drop the branch). Control exceptions escape.

    Args:
      factory: Zero-argument callable returning the awaitable to run.

    Returns:
      The awaited result, or ``None`` when an ordinary ``Exception`` is raised.
    """
    try:
      return await factory()
    except _CONTROL_EXC:
      # Listed first so a control exception is never turned into ``None``.
      raise
    except Exception:  # noqa: BLE001 - includes DynamicNodeFailError
      return None

  async def _supervise(
      self, factories: Sequence[Callable[[], Awaitable[Any]]]
  ) -> list[Any]:
    """Structured fan-out via TaskGroup. See the failure/cancellation contract
    in the module docstring: ordinary failure -> None; NodeInterruptedError
    (and other non-cancellation BaseException) propagates and cancels the rest;
    a branch's own CancelledError leaves its slot None without cancelling
    siblings. Results preserve input order.

    Args:
      factories: Zero-argument callables, one per branch.

    Returns:
      One entry per factory, in the order the factories were given.

    Raises:
      NodeInterruptedError: If any branch raised ``NodeInterruptedError``.
    """
    # Pre-filled with None so a branch that never stores a result stays None.
    results: list[Any] = [None] * len(factories)

    async def _run_one(i: int, f: Callable[[], Awaitable[Any]]) -> None:
      # Each task writes only its own slot, which keeps input order.
      results[i] = await self._guard_ordinary(f)

    try:
      async with asyncio.TaskGroup() as tg:
        for i, f in enumerate(factories):
          tg.create_task(_run_one(i, f))
    except* NodeInterruptedError:
      # TaskGroup reports failures as an exception group; unwrap it so callers
      # can catch a plain NodeInterruptedError.
      # NOTE(review): a fresh instance is raised, so any state carried by the
      # original exception(s) is not forwarded — confirm callers do not need it.
      raise NodeInterruptedError()
    return results

  async def parallel(
      self, thunks: Sequence[Callable[[], Awaitable[Any]]]
  ) -> list[Any]:
    """BARRIER fan-out. thunks: zero-arg callables returning awaitables.

    Returns:
      The thunks' results in input order; failure handling is that of
      ``_supervise``.
    """
    return await self._supervise(thunks)

  async def pipeline(
      self,
      items: Sequence[Any],
      *stages: Callable[[Any, Any, int], Awaitable[Any]],
      gate_drivers: bool = False,
  ) -> list[Any]:
    """Barrier-free per-item pipelining. Stage signature: (prev, item, index).

    Each item flows through all stages independently; item A may be in stage
    k while item B is in stage 1. An ordinary Exception in a stage drops that
    item to None; control exceptions propagate.

    ``gate_drivers=True`` is the intentionally-BUGGY variant used by the
    contrast test to demonstrate the nested-combinator deadlock.

    Args:
      items: Inputs; one driver coroutine is created per item.
      *stages: Stage callables applied in order. The first stage receives the
        item itself as ``prev``.
      gate_drivers: When true, each driver also acquires ``self.gate`` for its
        whole run.

    Returns:
      The final stage output per item, in input order.
    """

    def make_driver(item: Any, i: int) -> Callable[[], Awaitable[Any]]:
      # A factory function binds ``item`` and ``i`` per driver.
      async def drive() -> Any:
        prev = item
        for stage in stages:
          prev = await stage(prev, item, i)
        return prev

      if gate_drivers:

        async def gated() -> Any:
          async with self.gate:  # gating the DRIVER -> deadlock on nesting
            return await drive()

        return gated
      return drive

    return await self._supervise(
        [make_driver(it, i) for i, it in enumerate(items)]
    )
async def _run(parent_fn, *, app_name="spike"):
  """Run ``parent_fn`` as a one-node workflow and return its probe outputs.

  Builds a ``Workflow`` whose only edge is START -> ``parent_fn``, runs it with
  an in-memory session and a fixed "go" user message, and keeps the ``output``
  of every ``Event`` whose output is a dict containing a ``"probe"`` key.

  Args:
    parent_fn: Node to run directly after START.
    app_name: Name used for the workflow.

  Returns:
    The matching output dicts, in emission order.
  """
  workflow = Workflow(name=app_name, edges=[("START", parent_fn)])
  session_service = InMemorySessionService()
  runner = Runner(
      app_name=workflow.name, node=workflow, session_service=session_service
  )
  session = await session_service.create_session(
      app_name=workflow.name, user_id="u"
  )
  kickoff = types.Content(parts=[types.Part(text="go")], role="user")
  return [
      event.output
      async for event in runner.run_async(
          user_id="u", session_id=session.id, new_message=kickoff
      )
      if isinstance(event, Event)
      and isinstance(event.output, dict)
      and "probe" in event.output
  ]
@pytest.mark.asyncio
async def test_pipeline_barrier_free():
  """Item 0 must enter stage 2 before item 1 has finished stage 1.

  Stage 1 sleeps 0.25s for item 1 only; every other child sleeps 0s. If the
  pipeline had a barrier between stages, ``s2_0`` could not start until
  ``s1_1`` had ended.
  """
  # Shared (name, phase, timestamp) log appended to by every child.
  log = []

  @node(rerun_on_resume=True)
  async def parent(ctx: Context, node_input):
    sup = DynamicNodeSupervisor(ctx)

    async def s1(prev, item, i):
      # Only item 1 is slow in stage 1.
      return await sup.dispatch(
          _child(f"s1_{i}", 0.25 if i == 1 else 0.0, log=log),
          node_input=item,
          run_id=f"s1x{i}",
      )

    async def s2(prev, item, i):
      # Stage 2 consumes stage 1's output (``prev``), not the raw item.
      return await sup.dispatch(
          _child(f"s2_{i}", 0.0, log=log), node_input=prev, run_id=f"s2x{i}"
      )

    res = await sup.pipeline([0, 1], s1, s2)
    yield Event(output={"probe": "pf", "res": res})

  await _run(parent)
  # Index the log by child name for the two phases of interest.
  starts = {n: t for (n, p, t) in log if p == "start"}
  ends = {n: t for (n, p, t) in log if p == "end"}
  assert "s2_0" in starts and "s1_1" in ends
  assert starts["s2_0"] < ends["s1_1"]  # no inter-stage barrier
@pytest.mark.asyncio
async def test_parallel_failed_item_isolation():
  """An ordinary error in one branch yields None; its siblings still succeed."""
  # (child name / run_id, delay, failure mode) per branch, in dispatch order.
  branches = [("p0", 0.02, None), ("p1", 0.01, "error"), ("p2", 0.02, None)]

  @node(rerun_on_resume=True)
  async def parent(ctx: Context, node_input):
    supervisor = DynamicNodeSupervisor(ctx)

    def make_thunk(position, label, delay, failure):
      # The child is built lazily, when the supervisor invokes the thunk.
      return lambda: supervisor.dispatch(
          _child(label, delay, fail=failure),
          node_input=position,
          run_id=label,
      )

    outcome = await supervisor.parallel([
        make_thunk(position, *branch)
        for position, branch in enumerate(branches)
    ])
    yield Event(output={"probe": "fi", "res": outcome})

  probes = await _run(parent)
  assert probes[0]["res"] == ["p0<-0", None, "p2<-2"]
@pytest.mark.asyncio
async def test_control_exception_propagates_and_cancels_siblings():
  """Ordinary errors become None; NodeInterruptedError escapes and cancels.

  Runs directly against the supervisor (``ctx=None``), because ``parallel``
  never touches the context when the thunks do not call ``dispatch``.
  """
  supervisor = DynamicNodeSupervisor(ctx=None)
  sibling_cancelled = False

  async def raise_ordinary():
    raise ValueError("ordinary")

  async def raise_interrupt():
    raise NodeInterruptedError()

  async def slow_sibling():
    nonlocal sibling_cancelled
    try:
      await asyncio.sleep(1.0)
    except asyncio.CancelledError:
      # Record the cancellation, then let it continue to propagate.
      sibling_cancelled = True
      raise
    return "finished"

  async def succeed():
    return "ok"

  # ordinary error -> None; sibling unaffected
  outcome = await supervisor.parallel([raise_ordinary, succeed])
  assert outcome == [None, "ok"]

  # control exception propagates AND cancels the running sibling
  with pytest.raises(NodeInterruptedError):
    await supervisor.parallel([raise_interrupt, slow_sibling])
  await asyncio.sleep(0)  # let cancellation settle
  assert sibling_cancelled is True  # explicit sibling-cancellation assertion
def test_gate_must_be_positive():
  """A zero or negative gate is rejected at construction time.

  gate=0 would deadlock every dispatch, so the constructor must raise instead
  of building an unusable semaphore.
  """
  for invalid_gate in (0, -1):
    with pytest.raises(ValueError):
      DynamicNodeSupervisor(ctx=None, gate=invalid_gate)
async def _resume_scenario(*, concurrent, ask_delay, child_delay):
  """Dispatch 3 plain children + 1 RequestInput child, interrupt, resume.

  Returns (body_runs, completed) where body_runs[name] counts body ENTRIES and
  `completed` lists children that ran to completion (emitted output) on run 1.
  The counter is captured by closure (NOT a pydantic field) so every body
  execution is observed by the same object.

  Timing knobs decide whether children complete before the interrupt:
    ask_delay   -- ask sleeps this long before issuing RequestInput
    child_delay -- each plain child sleeps this long before completing

  Args:
    concurrent: If true, the four thunks go through ``sup.parallel``; otherwise
      they are awaited one after another as the sequential control.
    ask_delay: Seconds the ``ask`` node sleeps before yielding RequestInput.
    child_delay: Seconds each plain child sleeps before completing.

  Returns:
    ``(body_runs, completed_on_run1)``: the entry counter covering both runs,
    and a snapshot of the children completed before the resume was issued.
  """
  # Both containers are shared by closure across run 1 and the resumed run.
  body_runs = collections.Counter()
  completed = []

  def plain(name):
    @node(name=name)
    async def child(ctx, node_input):
      # Counted on entry, so a cancelled-then-re-run child shows up as 2.
      body_runs[name] += 1
      await asyncio.sleep(child_delay)
      # Reached only if the sleep was not cancelled.
      completed.append(name)
      yield Event(output=f"{name}=done")

    return child

  @node(name="ask", rerun_on_resume=True)
  async def ask(ctx: Context, node_input):
    body_runs["ask"] += 1
    # No resume input for "ask" means this is the first (pre-interrupt) entry.
    resume = getattr(ctx, "resume_inputs", {}).get("ask")
    if resume is None:
      await asyncio.sleep(ask_delay)
      yield RequestInput(interrupt_id="ask", message="approve ask?")
    else:
      yield Event(output="ask=approved")

  @node(rerun_on_resume=True)
  async def parent(ctx: Context, node_input):
    sup = DynamicNodeSupervisor(ctx, gate=8)
    # Thunks build their child lazily; ``ask`` is dispatched last.
    thunks = [
        (lambda: sup.dispatch(plain("a"), node_input=1, run_id="ax")),
        (lambda: sup.dispatch(plain("b"), node_input=2, run_id="bx")),
        (lambda: sup.dispatch(plain("c"), node_input=3, run_id="cx")),
        (lambda: sup.dispatch(ask, node_input=4, run_id="askx")),
    ]
    if concurrent:
      res = await sup.parallel(thunks)
    else:
      res = [await t() for t in thunks]  # sequential control
    yield Event(output={"probe": "resume", "res": res})

  wf = Workflow(name="resume_wf", edges=[("START", parent)])
  # The workflow is wrapped in an App with resumability switched on.
  app = App(
      name="resume_app",
      root_agent=wf,
      resumability_config=ResumabilityConfig(is_resumable=True),
  )
  ss = InMemorySessionService()
  runner = Runner(app=app, session_service=ss)
  session = await ss.create_session(app_name=app.name, user_id="u")

  # run 1 -> expect RequestInput interrupt
  msg = types.Content(parts=[types.Part(text="go")], role="user")
  ev1 = [
      e
      async for e in runner.run_async(
          user_id="u", session_id=session.id, new_message=msg
      )
  ]
  # Find the event carrying the RequestInput function call; if several match,
  # the last one wins.
  req = None
  for e in ev1:
    if getattr(e, "content", None) and e.content and e.content.parts:
      for p in e.content.parts:
        if (
            p.function_call
            and p.function_call.name == REQUEST_INPUT_FUNCTION_CALL_NAME
        ):
          req = e
  assert req is not None, "expected a RequestInput interrupt on run 1"
  # Snapshot before resuming: the resumed run keeps appending to ``completed``.
  completed_on_run1 = list(completed)
  interrupt_id = get_request_input_interrupt_ids(req)[0]
  invocation_id = req.invocation_id

  # resume
  part = create_request_input_response(interrupt_id, {"approved": "yes"})
  # Drain the resumed run; only its effect on ``body_runs`` matters.
  _ = [
      e
      async for e in runner.run_async(
          user_id="u",
          session_id=session.id,
          new_message=types.Content(parts=[part], role="user"),
          invocation_id=invocation_id,
      )
  ]
  return body_runs, completed_on_run1
Cancelled (never-completed) children correctly re-run on resume. This + is correctness-preserving (not a double-spend of completed work), though it + does discard the cancelled siblings' partial progress — a design trade-off + the RFC should note.""" + runs, completed1 = await _resume_scenario( + concurrent=True, ask_delay=0.0, child_delay=0.10 + ) + assert completed1 == [] # none completed (all cancelled) + assert ( + runs["a"] == 2 and runs["b"] == 2 and runs["c"] == 2 + ) # re-run is CORRECT here diff --git a/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py b/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py new file mode 100644 index 00000000000..7a638eae785 --- /dev/null +++ b/contributing/samples/workflows/dynamic_supervisor_spike/test_live_gemini_e2e.py @@ -0,0 +1,159 @@ +# Copyright 2026 Google LLC +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +"""OPTIONAL live end-to-end evidence for the DynamicNodeSupervisor spike. + +This is supporting evidence only — NOT part of the deterministic merge gate. +It is skipped unless a real model is explicitly configured via env vars, so it +never runs in CI by accident and contains no hardcoded project/location/model. + +Enable with, e.g. 
(Vertex): + export SPIKE_LIVE=1 + export GOOGLE_GENAI_USE_VERTEXAI=1 + export GOOGLE_CLOUD_PROJECT=your-project-id # required; an empty value keeps the test skipped + export GOOGLE_CLOUD_LOCATION=global # gemini-3.5-flash serves here + export SPIKE_GEMINI_MODEL=gemini-3.5-flash # or any flash model you can access + +The model is read from ``SPIKE_GEMINI_MODEL`` and **defaults to +``gemini-2.5-flash``** (broadly available in regional Vertex). To use +``gemini-3.5-flash`` set ``SPIKE_GEMINI_MODEL=gemini-3.5-flash`` and +``GOOGLE_CLOUD_LOCATION=global`` (it does not serve from ``us-central1``). + +It runs a 2-stage (review -> severity) pipeline over a few snippets, fanned out +concurrently through the supervisor, and asserts a real concurrency speedup. +""" + +from __future__ import annotations + +import asyncio +import os +import time + +import pytest + +_LIVE = os.environ.get("SPIKE_LIVE") == "1" and bool( + os.environ.get("GOOGLE_CLOUD_PROJECT") or os.environ.get("GOOGLE_API_KEY") +) + +pytestmark = pytest.mark.skipif( + not _LIVE, + reason=( + "live model not configured; set SPIKE_LIVE=1 and model/project env vars" + ), +) + +MODEL = os.environ.get("SPIKE_GEMINI_MODEL", "gemini-2.5-flash") + +SNIPPETS = [ + "def login(pw): return pw == 'admin123' # hardcoded password", + "query = f\"SELECT * FROM users WHERE id = {request.args['id']}\"", + "def add(a, b): return a + b", + "os.system('ping ' + user_supplied_host)", +] + + +@pytest.mark.asyncio +async def test_live_gemini_pipeline_speedup(): + import os as _os + import sys as _sys + + from google.adk import Agent + from google.adk import Context + from google.adk import Event + from google.adk import Workflow + from google.adk.runners import Runner + from google.adk.sessions.in_memory_session_service import InMemorySessionService + from google.adk.workflow import node + from google.genai import types + + _sys.path.insert(0, _os.path.dirname(_os.path.abspath(__file__))) + from supervisor import DynamicNodeSupervisor # noqa: E402 + + reviewer = Agent( + name="reviewer", +
model=MODEL, + instruction=( + "You are a security reviewer. The user message is a code " + "snippet. In ONE short sentence, state the single biggest " + "security concern, or 'none'." + ), + ) + rater = Agent( + name="rater", + model=MODEL, + instruction=( + "The user message is a security concern. Reply with EXACTLY " + "one word: CRITICAL, HIGH, MEDIUM, LOW, or NONE." + ), + ) + + latencies: list[float] = [] + + async def timed(coro): + t = time.perf_counter() + out = await coro + latencies.append(time.perf_counter() - t) + return out + + @node(rerun_on_resume=True) + async def parent(ctx: Context, node_input): + sup = DynamicNodeSupervisor(ctx, gate=8) + + async def review(_prev, snippet, i): + return await timed( + sup.dispatch(reviewer, node_input=snippet, run_id=f"rev{i}") + ) + + async def rate(concern, snippet, i): + return await timed( + sup.dispatch(rater, node_input=str(concern), run_id=f"rate{i}") + ) + + t0 = time.perf_counter() + res = await sup.pipeline(SNIPPETS, review, rate) + yield Event( + output={ + "probe": "live", + "res": res, + "wall": time.perf_counter() - t0, + "sum": sum(latencies), + "n": len(latencies), + } + ) + + wf = Workflow(name="live", edges=[("START", parent)]) + ss = InMemorySessionService() + runner = Runner(app_name=wf.name, node=wf, session_service=ss) + session = await ss.create_session(app_name=wf.name, user_id="u") + msg = types.Content(parts=[types.Part(text="go")], role="user") + out = None + async for ev in runner.run_async( + user_id="u", session_id=session.id, new_message=msg + ): + if ( + isinstance(ev, Event) + and isinstance(ev.output, dict) + and ev.output.get("probe") == "live" + ): + out = ev.output + + assert out is not None + assert out["n"] == len(SNIPPETS) * 2 # 2 real calls per item + assert len([r for r in out["res"] if r]) == len(SNIPPETS) + # concurrent pipeline wall-clock is well under the serial sum of call latencies + assert out["wall"] < out["sum"] * 0.6 + print( + f"\nlive {MODEL}: {out['n']} 
calls, wall={out['wall']:.2f}s " + f"vs serial-sum={out['sum']:.2f}s = {out['sum']/out['wall']:.1f}x" + )