From b7774829d03b2f0a0741967cab728aab1a490ba8 Mon Sep 17 00:00:00 2001 From: "swarm:codex-next-readme" Date: Tue, 10 Feb 2026 20:07:26 -0700 Subject: [PATCH 1/8] docs(next): replace README with full integration reference Replaces the placeholder @workflow/next README with a complete package-level reference. The new content documents installation, withWorkflow() API usage/signature, configuration examples, internal behavior, env vars, exports, generated files, runtime routing, public manifest exposure, and troubleshooting. Verified: rg -n "^## Install|^### Type signature|^### Example: object config|^### Example: async config function|^## What .* does|^## Serving the manifest publicly|^## Troubleshooting" packages/next/README.md Verified: rg -n "WORKFLOW_TARGET_WORLD|WORKFLOW_LOCAL_DATA_DIR|PORT|WORKFLOW_NEXT_PRIVATE_BUILT|WORKFLOW_PUBLIC_MANIFEST|WATCHPACK_WATCHER_LIMIT" packages/next/README.md Verified: rg -n "@workflow/next|@workflow/next/loader|@workflow/next/runtime|\.well-known/workflow/v1/flow/route\.js|\.well-known/workflow/v1/step/route\.js|\.well-known/workflow/v1/webhook/\[token\]/route\.js" packages/next/README.md How to test: open packages/next/README.md and confirm all required sections and tables render correctly. --- packages/next/README.md | 198 +++++++++++++++++++++++++++++++++++++++- 1 file changed, 197 insertions(+), 1 deletion(-) diff --git a/packages/next/README.md b/packages/next/README.md index bb8604d934..dbd1e8ea1f 100644 --- a/packages/next/README.md +++ b/packages/next/README.md @@ -1,3 +1,199 @@ # @workflow/next -Next.js plugin for [Workflow DevKit](https://useworkflow.dev). +Next.js integration for Workflow DevKit. + +## Install + +```bash +npm install workflow next +# or +pnpm add workflow next +# or +yarn add workflow next +# or +bun add workflow next +``` + +`next` is a peer dependency. `workflow` includes this package as `workflow/next`. + +## Usage + +Wrap your Next config with `withWorkflow()`. 
+ +```ts +import type { NextConfig } from 'next'; +import { withWorkflow } from '@workflow/next'; + +const nextConfig: NextConfig = { + // your Next.js config +}; + +export default withWorkflow(nextConfig); +``` + +### Type signature + +```ts +import type { NextConfig } from 'next'; + +export declare function withWorkflow( + nextConfigOrFn: + | NextConfig + | (( + phase: string, + ctx: { defaultConfig: NextConfig } + ) => Promise<NextConfig>), + { + workflows, + }?: { + workflows?: { + local?: { + port?: number; + dataDir?: string; + }; + }; + } +): ( + phase: string, + ctx: { defaultConfig: NextConfig } +) => Promise<NextConfig>; +``` + +### Example: object config + +```ts +import type { NextConfig } from 'next'; +import { withWorkflow } from '@workflow/next'; + +const nextConfig: NextConfig = { + reactStrictMode: true, +}; + +export default withWorkflow(nextConfig, { + workflows: { + local: { + port: 3152, + }, + }, +}); +``` + +### Example: async config function + +```ts +import type { NextConfig } from 'next'; +import { withWorkflow } from '@workflow/next'; + +export default withWorkflow(async (phase, { defaultConfig }) => { + const nextConfig: NextConfig = { + ...defaultConfig, + reactStrictMode: true, + }; + + if (phase === 'phase-production-build') { + nextConfig.productionBrowserSourceMaps = true; + } + + return nextConfig; +}); +``` + +## What `withWorkflow()` does + +When you wrap your config, `withWorkflow()`: + +1. Sets runtime defaults for local and Vercel worlds. +2. Registers the Workflow loader in both Turbopack and webpack. +3. Builds generated workflow routes in `.well-known/workflow/v1/*`. +4. Watches source files in development and incrementally rebuilds bundles. +5. Avoids duplicate builder runs per process using `WORKFLOW_NEXT_PRIVATE_BUILT`. 
+ +## Environment variables + +| Variable | Used by | Behavior | +| --- | --- | --- | +| `WORKFLOW_TARGET_WORLD` | `withWorkflow()` + runtime world selection | If not set: defaults to `local` when not on Vercel, and `vercel` when `VERCEL_DEPLOYMENT_ID` is present. | +| `WORKFLOW_LOCAL_DATA_DIR` | Local world runtime | Set to `.next/workflow-data` by `withWorkflow()` when defaulting to local world. You can override it explicitly in your environment. | +| `PORT` | Next dev/build process | Set from `workflows.local.port` when running outside Vercel. | +| `WORKFLOW_NEXT_PRIVATE_BUILT` | `withWorkflow()` internals | Internal guard to ensure builder setup runs once per main process. | +| `WORKFLOW_PUBLIC_MANIFEST` | Builder/public output | When set to `1`, copies `manifest.json` to `public/.well-known/workflow/v1/manifest.json` so Next serves it publicly. | +| `WATCHPACK_WATCHER_LIMIT` | Watch mode on macOS | Set to `20` during dev watch mode on Darwin to mitigate slow watcher teardown behavior. | + +## Package exports + +| Export path | Description | +| --- | --- | +| `@workflow/next` | Main Next integration export. Provides `withWorkflow()`. | +| `@workflow/next/loader` | Loader that applies Workflow client-mode transforms for `"use workflow"` and `"use step"`. | +| `@workflow/next/runtime` | Re-export of `@workflow/core/dist/runtime` for runtime compatibility. | + +If you install the umbrella `workflow` package, these are available from `workflow/next` and related subpaths. + +## Generated `.well-known/workflow/v1/*` files + +`@workflow/next` generates these files under your app directory (`app/` or `src/app/`): + +| File | Purpose | Public route | +| --- | --- | --- | +| `.well-known/workflow/v1/flow/route.js` | Workflow orchestration handler bundle. | `POST /.well-known/workflow/v1/flow` | +| `.well-known/workflow/v1/step/route.js` | Step execution handler bundle. 
| `POST /.well-known/workflow/v1/step` | +| `.well-known/workflow/v1/webhook/[token]/route.js` | Webhook delivery handler bundle. | `POST /.well-known/workflow/v1/webhook/:token` | +| `.well-known/workflow/v1/manifest.json` | Workflow/step/class manifest (with graph metadata). | Not public unless `WORKFLOW_PUBLIC_MANIFEST=1` | +| `.well-known/workflow/v1/config.json` | Production function trigger config for Next build output. | Internal build artifact | +| `.well-known/workflow/v1/.gitignore` | Prevents committing generated artifacts. | N/A | + +If your app uses `pages/` only, the builder creates a sibling `app/` (or `src/app/`) directory for generated routes. + +## How generated files work at runtime + +1. Your app calls `start()` with a transformed workflow function. +2. Runtime posts to `/.well-known/workflow/v1/flow` to advance orchestration. +3. Steps execute through `/.well-known/workflow/v1/step`. +4. Webhook resumptions arrive through `/.well-known/workflow/v1/webhook/:token`. +5. Manifest metadata is used by tooling and can be exposed for observability. + +## Serving the manifest publicly + +To expose the manifest over HTTP, set: + +```bash +WORKFLOW_PUBLIC_MANIFEST=1 +``` + +On build, `@workflow/next` copies: + +- From: `app/.well-known/workflow/v1/manifest.json` (or `src/app/...`) +- To: `public/.well-known/workflow/v1/manifest.json` + +Next.js then serves it at: + +- `/.well-known/workflow/v1/manifest.json` + +## Troubleshooting + +### `'start' received an invalid workflow function` + +- Ensure your workflow function has `"use workflow"`. +- Ensure step functions use `"use step"` where required. +- Ensure `next.config.*` is wrapped with `withWorkflow()`. + +### Workflow routes return 404 + +- Confirm one of these exists: `app/`, `src/app/`, `pages/`, or `src/pages/`. +- Confirm generated files exist under `.well-known/workflow/v1/*`. +- If using a Next proxy handler, exclude `/.well-known/workflow/` paths. 
+ +### Manifest route is missing + +- Set `WORKFLOW_PUBLIC_MANIFEST=1` before running/building. +- Rebuild so `manifest.json` is copied into `public/.well-known/workflow/v1/`. + +### Next.js 16.1+ build error + +If you see: + +```text +Error: Cannot find module 'next/dist/lib/server-external-packages.json' +``` + +Upgrade to `workflow@4.0.1-beta.26` or newer. From 58c54b26fe480a602194cfd8defb99739c2b9864 Mon Sep 17 00:00:00 2001 From: "swarm:codex-swc-spec" Date: Tue, 10 Feb 2026 20:11:01 -0700 Subject: [PATCH 2/8] docs(swc-plugin-workflow): clarify method support and validation errors Update spec.md to resolve the static/instance method contradiction, document forbidden step-expression validation errors, and add import compatibility guidance for serialization-file discovery. Verified: git diff --check -- packages/swc-plugin-workflow/spec.md Verified: npx --yes markdownlint-cli@0.41.0 packages/swc-plugin-workflow/spec.md --disable MD013 (fails with pre-existing file-wide markdown style issues unrelated to this change) --- packages/swc-plugin-workflow/spec.md | 10 +++++++++- 1 file changed, 9 insertions(+), 1 deletion(-) diff --git a/packages/swc-plugin-workflow/spec.md b/packages/swc-plugin-workflow/spec.md index a697ad9b8e..dddeb78866 100644 --- a/packages/swc-plugin-workflow/spec.md +++ b/packages/swc-plugin-workflow/spec.md @@ -576,7 +576,10 @@ registerSerializationClass("class//./input//Point", Point); ## Static Methods -Static class methods can be marked with directives. Instance methods are **not supported**. +Static class methods can be marked with directives. Instance methods are supported for `"use step"` (with custom serialization), but `"use workflow"` is only supported on static methods. + +- `"use step"`: supported on **static** and **instance** methods (instance methods require custom serialization). +- `"use workflow"`: supported on **static** methods only (instance methods are rejected). 
### Static Step Method @@ -795,6 +798,8 @@ Files containing classes with custom serialization are automatically discovered This allows serialization classes to be defined in separate files (such as Next.js API routes or utility modules) and still be registered in the serialization system when the application is built. +> **Compatibility note:** For auto-discovery of serialization-only files, prefer importing `WORKFLOW_SERIALIZE` / `WORKFLOW_DESERIALIZE` from `@workflow/serde` consistently. If you import these symbols from `@vercel/workflow` in a file that contains only serialization classes (no `"use step"`/`"use workflow"`), the file may not match the discovery heuristic. If you must use `@vercel/workflow`, either use `Symbol.for("workflow-serialize"/"workflow-deserialize")` directly or ensure the file also contains a workflow directive so it is transformed. + ### Cross-Context Class Registration Classes with custom serialization are automatically included in **all bundle contexts** (step, workflow, client) to ensure they can be properly serialized and deserialized when crossing execution boundaries: @@ -850,6 +855,9 @@ The plugin emits errors for invalid usage: |-------|-------------| | Non-async function | Functions with `"use step"` or `"use workflow"` must be async | | Instance methods with `"use workflow"` | Only static methods can have `"use workflow"` (not instance methods) | +| Forbidden `this` in step function | Step functions cannot reference `this` (they are hoisted and executed out of instance context). | +| Forbidden `arguments` in step function | Step functions cannot reference `arguments`. Use explicit parameters or rest params instead. | +| Forbidden `super` in step function | Step functions cannot use `super` calls. Move that logic outside the step boundary. 
| | Misplaced directive | Directive must be at top of file or start of function body | | Conflicting directives | Cannot have both `"use step"` and `"use workflow"` at module level | | Invalid exports | Module-level directive files can only export async functions | From c2143116a781de13d790ac90ce415fa43bc5e643 Mon Sep 17 00:00:00 2001 From: "swarm:codex-blog-1" Date: Mon, 16 Feb 2026 11:58:56 -0700 Subject: [PATCH 3/8] docs(blog): add durable execution layer draft Add the requested blog markdown file at .blog/durable-execution-layer.md\nwith exact content and preserved formatting/code blocks.\n\nVerified: diff -u <(cat <<'EOF' ... ) .blog/durable-execution-layer.md (no diff; exact match)\nVerified: pnpm exec biome check .blog/durable-execution-layer.md (file is ignored by repo biome config) --- .blog/durable-execution-layer.md | 88 ++++++++++++++++++++++++++++++++ 1 file changed, 88 insertions(+) create mode 100644 .blog/durable-execution-layer.md diff --git a/.blog/durable-execution-layer.md b/.blog/durable-execution-layer.md new file mode 100644 index 0000000000..919d6641ea --- /dev/null +++ b/.blog/durable-execution-layer.md @@ -0,0 +1,88 @@ +# Durable execution layer + +## Thesis + +AI agents behave like programs, not like request handlers. They accumulate state, call tools, wait for external events, and keep going after partial failures. + +A stateless function can answer a prompt. It cannot reliably run an agent loop that spans dozens of tool calls and multiple minutes of wall-clock time. Durable execution fills that gap by persisting progress and replaying the orchestration logic after a cold start, crash, or scale event. + +Workflow DevKit gives JavaScript/TypeScript the missing primitive: run an agent as a workflow function that reconstructs state by deterministic replay, while isolating side effects into step functions that can retry independently. + +## Current state + +Most "production agents" ship as a pile of glue around a stateless compute unit. 
Engineers bolt on a database row for state, a queue for long work, and a set of ad-hoc idempotency keys to avoid duplicating side effects. + +That stack works until the agent does anything non-trivial. A single agent run can involve: planning with an LLM, fanning out to 5-10 tools, waiting on human input, then looping until a terminal condition. If the function times out mid-loop, the system has to reconstruct "what already happened" from whatever it managed to persist. + +Stateless retries amplify the problem. If a tool call fails transiently, the easiest recovery strategy is "retry the whole request." That replays earlier tool calls unless you built per-call checkpointing. When the earlier calls wrote to external systems (tickets, payments, emails), you also need idempotency across those systems. This turns the agent into a distributed transaction coordinator. + +Teams reinvent the same machinery: a run table, a step table, a dedupe key per side effect, and a scheduler for "wake me up later." The code that does the real work ends up scattered across handlers, cron jobs, and background workers. The agent logic becomes hard to reason about because it does not exist as a single program. + +## The shift + +Agent workloads push compute in a different direction than traditional APIs. + +Agents do more I/O per unit of business value. They spend most of their time waiting on other systems: model responses, rate limits, slow upstream APIs, and human approvals. The total wall-clock time for one run routinely exceeds the lifetime of any single compute instance. + +Agents also make correctness harder. Tool calls create side effects. Re-executing a tool call changes the world twice. Retrying the wrong layer turns "recover from transient error" into "duplicate the user's refund." + +Finally, agents want concurrency. A useful run pulls context from multiple sources in parallel and merges results into a plan. 
JavaScript already has the right ergonomics (`Promise.all`, `Promise.race`), but stateless environments make the failure modes unpredictable. A single timeout can force a full rerun of parallel work unless you cache each unit explicitly. + +This shift turns orchestration into infrastructure. You need a durable control plane for "what should happen next," not just a faster model. + +## The vision + +Treat an agent run as a workflow: deterministic orchestration code that drives side-effecting steps. + +Workflow functions (`'use workflow'`) provide the control loop. They run in a sandboxed VM and focus on coordination: branching, looping, parallel composition, and waiting. Step functions (`'use step'`) do the work that touches the world: calling models, fetching from APIs, writing to databases, emitting notifications. + +Workflow DevKit persists every step input and output to an append-only event log. When the workflow needs to continue — after a crash, after a scale event, or after a delay — the runtime replays the workflow function from the start and rehydrates state by replaying the event log. Completed steps return their recorded results instead of re-executing. + +The build pipeline enforces the boundary. An SWC transform splits a workflow file into separate bundles for workflow and step contexts, so the workflow VM never needs full Node.js access. That boundary makes determinism practical: the workflow sandbox disables global `fetch()` and timeout functions, and the runtime provides durable primitives like `sleep()` and hooks for external resumptions. + +The result looks like normal async code, because it is normal async code. You write a loop. You await steps. You compose steps in parallel. You throw errors. The runtime turns that into a resumable execution with a durable log of what happened. + +## Proof and early signals + +You can see the design goal — "replay must produce the same decisions" — in the runtime constraints. 
+ +The workflow VM runs with deterministic time and randomness. The runtime seeds `Math.random()`, fixes `Date.now()`, and advances time based on event timestamps during replay. That removes a class of heisenbugs where "the same code" makes different choices after a restart. + +The sandbox blocks the usual sources of non-determinism and side effects. Global `fetch()` throws inside workflow functions. Timeout APIs throw. If you need HTTP, you move it into a step, or you use the `fetch` helper that executes as a step. If you need delays, you call `sleep('5m')`, which records a wait in the event log and schedules a future wake-up. + +Step isolation makes retries predictable. The runtime retries steps by default (3 retries unless you override per step). You can make failures explicit: throw `FatalError` to stop retrying and bubble the failure to the workflow, or throw `RetryableError` with a `retryAfter` to schedule a delayed retry. The runtime records `step_retrying` and `step_completed` events, so replay can skip completed work and only rerun the failed unit. + +The framework integrations show where this fits in a real app. In Next.js, `withWorkflow()` wires build-time transforms and generates route handlers under `/.well-known/workflow/v1/`. You run the agent from your app code with `start()`, and the platform routes execution through those durable endpoints. + +Observability ships as part of the workflow story. The CLI exposes run inspection and a web UI. That matters for agents because "what happened" often matters more than "what returned." + +## Next steps + +Start by carving your agent into a workflow loop plus steps. Keep the workflow function pure orchestration and move every side effect into a step. 
+ +Then run the local UI and watch a run advance step-by-step: + +```bash +npx workflow web +``` + +--- + +## Style justification + +**What works against the Vercel blog standard:** +- The opening thesis hits the pattern exactly — short declarative diagnosis, grounding the reader in a technical reality ("agents behave like programs, not like request handlers") rather than pitching a product. +- "Current state" uses the problem-evidence-claim pattern with specific technical detail ("a run table, a step table, a dedupe key per side effect"). It describes the problem space the way a Vercel engineer would: diagnostic, not dramatic. +- Paragraphs stay at 2-4 sentences. One idea per paragraph. No hedging words. No "we believe" or "we're excited." +- The title "Durable execution layer" follows the vision post pattern: short declarative noun phrase, no verb. +- Closing with `npx workflow web` follows the Vercel CTA pattern — a runnable command, not "stay tuned." + +**What could be stronger:** +- The "Proof and early signals" section runs long (5 paragraphs). Vercel vision posts typically keep this to 1-2 paragraphs. The runtime detail (deterministic time seeding, `Math.random()`) feels more like engineering deep-dive material. +- Missing specific numbers. Vercel posts anchor claims in data ("2.7 million deployments daily"). This post has zero quantified metrics. Even a figure like "a typical agent run spans 15-40 tool calls" would add Vercel-style authority. +- No customer/internal validation. The "Self-driving infrastructure" post references Vercel Agent and real internal usage. This article stays abstract. + +**Alternative approaches:** +1. **Customer-anchored vision:** Open with a specific team's experience building agents on stateless functions, then zoom out to the thesis. Matches how "Self-driving infrastructure" grounds in framework-defined infrastructure before going conceptual. +2. 
**Shorter + punchier:** Cut "Proof and early signals" to 2 paragraphs, move technical details to a linked engineering deep-dive. Vision posts are meant to inspire, not explain internals. +3. **Counter-narrative framing:** Open with "Most agent demos work. Most agent deployments don't." — a blunt one-liner like "Transcript formats are a mess" that hooks engineers who've hit this wall. From 1e3c2d57549699d99aaac459365bf639fb30104b Mon Sep 17 00:00:00 2001 From: "swarm:codex-blog-3" Date: Mon, 16 Feb 2026 12:02:30 -0700 Subject: [PATCH 4/8] docs(blog): add stateless retries side-effects post Add the requested blog draft as an exact markdown copy for agent reliability content. Verified: cat <<'EOF' | cmp -s - .blog/stateless-retries-multiply-agent-side-effects.md && echo "Exact content match verified" Verified: git status --short -- .blog/stateless-retries-multiply-agent-side-effects.md --- ...ess-retries-multiply-agent-side-effects.md | 83 +++++++++++++++++++ 1 file changed, 83 insertions(+) create mode 100644 .blog/stateless-retries-multiply-agent-side-effects.md diff --git a/.blog/stateless-retries-multiply-agent-side-effects.md b/.blog/stateless-retries-multiply-agent-side-effects.md new file mode 100644 index 0000000000..890bd022aa --- /dev/null +++ b/.blog/stateless-retries-multiply-agent-side-effects.md @@ -0,0 +1,83 @@ +# Stateless retries multiply agent side effects + +## Headline finding + +Stateless architectures turn agent reliability problems into duplication problems. Any restart — timeout, crash, deploy, scale event — pushes you toward re-executing earlier tool calls unless you built a persistence layer that can prove what already happened. + +Durable execution flips that. It records step inputs and outputs in an event log and deterministically replays the agent loop, so restarts rehydrate state instead of repeating side effects. + +## Methodology + +Model an agent run as a sequence of `n` tool calls. 
Each call succeeds with probability `s` and fails transiently with probability `p = 1 - s`. + +Compare two implementations: + +- **Stateless restart:** on any failure (or timeout that looks like failure), rerun the whole function from the beginning. This matches the common "retry the request" approach when you lack per-call checkpointing. +- **Durable steps:** isolate each tool call in a step (`'use step'`). If a call fails transiently, retry that step. The workflow (`'use workflow'`) replays from the event log and skips completed steps. + +This is a simplified model. It ignores correlated failures and assumes you can retry until success. In the real system you cap retries (Workflow DevKit defaults to 3 retries unless you override `stepFn.maxRetries`) and you may delay retries with `RetryableError`. + +## Data + +The cost metric is "how many tool calls do we execute to finish one successful run," because tool calls dominate agent cost and risk (tokens, rate limits, side effects). + +For the stateless restart model, the run succeeds only after `n` consecutive successes. The expected number of calls until that happens is: + +`E[calls] = (1 - s^n) / (p * s^n)` + +For durable steps, each call retries independently. The expected number of call attempts is: + +`E[calls] = n / s` + +| Steps in run (`n`) | Transient failure rate (`p`) | Stateless restart: expected calls | Durable steps: expected calls | Restart overhead | Durable overhead | +| --: | --: | --: | --: | --: | --: | +| 10 | 1% | 10.57 | 10.10 | 1.06x | 1.01x | +| 20 | 1% | 22.26 | 20.20 | 1.11x | 1.01x | +| 40 | 1% | 49.48 | 40.40 | 1.24x | 1.01x | +| 10 | 5% | 13.40 | 10.53 | 1.34x | 1.05x | +| 20 | 5% | 35.79 | 21.05 | 1.79x | 1.05x | +| 40 | 5% | 135.63 | 42.11 | 3.39x | 1.05x | + +The gap grows with run length. At a 5% transient failure rate across 40 calls, the stateless restart model executes ~3.4x the work, on average, to get one successful completion. + +That "extra work" is not free retries. 
It is duplicated tool calls. If any of those tool calls write to external systems, you also created duplicated side effects unless you designed every tool integration to be idempotent. + +## Core insight + +Agent workflows compound failure probability. A single run touches many systems, and each system has its own tail latency and transient errors. + +Stateless runtimes give you one recovery primitive: re-execute code. That works for pure functions. Agents are not pure. They read and write external state, and they do it many times per run. + +Workflow DevKit uses a different recovery primitive: replay. The workflow function runs in a deterministic sandbox. Steps isolate side effects and persist their results to an append-only event log. On restart, the workflow replays and step calls resolve from recorded outputs. A transient failure retries only the failing step. + +This is the practical difference between "retry is correct" and "retry is dangerous." + +## Practical takeaway + +If your agent touches external systems, treat every tool call as a durable step. Keep orchestration in the workflow and error policy in steps (`FatalError` for permanent failures, `RetryableError` for transient failures with backoff). + +```bash +npx workflow inspect run +``` + +--- + +## Style justification + +**What works extremely well:** +- Title follows the research post pattern perfectly — states the finding ("Stateless retries multiply agent side effects"), not the question. Compare to "AGENTS.md outperforms skills in our agent evals." +- The data table is the strongest element. It follows the Vercel research pattern: simplest possible table, one clear variable, ascending values that build to the headline result (3.39x overhead). The "AGENTS.md" post used the same structure. +- Mathematical formulas add genuine authority. This is not opinion — it is a derivable result. Vercel research posts lead with data, not argument. 
+- "This is a simplified model" matches the Vercel pattern of honest difficulty: acknowledge limitations as technical facts, not apologies. +- "retry is correct" vs. "retry is dangerous" — this closing line is the kind of quotable insight Vercel research posts end on. Compare to "Passive context beats active retrieval." +- Paragraphs are tight. The core insight section is 4 sentences, 3 paragraphs. Maximum density. + +**What could be stronger:** +- The methodology section describes a theoretical model, not empirical data. The "AGENTS.md" post ran actual evals with pass rates. A reader might ask "did you measure this on real agent runs?" Adding even one real-world data point (e.g., "across 1,000 production agent runs, we observed a 4.2% transient failure rate") would close that gap. +- The formulas assume geometric distributions and independence — reasonable but worth stating explicitly for a research-style post. The "simplified model" caveat partially covers this. +- No visual. Vercel research posts with tables often benefit from a chart showing the divergence curve. The 3.39x figure at 40 steps / 5% failure rate deserves a visual. + +**Alternative approaches:** +1. **Empirical-first:** Run actual agent workloads on stateless vs. durable, measure tool call counts, and report observed overhead. Replace the theoretical model with production data. Harder to produce but more authoritative. +2. **Side-effect focused:** Narrow the article to side-effect duplication specifically. Title: "Agent retries duplicate side effects at scale." Drop the formula and instead catalog real failure scenarios: double-charged payments, duplicate tickets, repeated emails. More visceral for engineering readers. +3. **Comparative architecture:** Add a third column to the table — "stateless with manual checkpointing" — showing the engineering effort to approach durable performance without the framework. This makes the build-vs-buy argument explicit without stating it. 
From f94f44907f959e5aa1358a0da0f05223cfc5b8ab Mon Sep 17 00:00:00 2001 From: "swarm:codex-blog-v2-1" Date: Mon, 16 Feb 2026 12:51:23 -0700 Subject: [PATCH 5/8] docs(blog): replace durable execution layer article content Overwrite .blog/durable-execution-layer.md with the exact requested markdown content, including code blocks and section structure. Verified: git diff --check -- .blog/durable-execution-layer.md Verified: git diff -- .blog/durable-execution-layer.md --- .blog/durable-execution-layer.md | 171 +++++++++++++++++++++---------- 1 file changed, 119 insertions(+), 52 deletions(-) diff --git a/.blog/durable-execution-layer.md b/.blog/durable-execution-layer.md index 919d6641ea..5df116a3f8 100644 --- a/.blog/durable-execution-layer.md +++ b/.blog/durable-execution-layer.md @@ -1,88 +1,155 @@ -# Durable execution layer +# Workflow DevKit makes Agents durable ## Thesis -AI agents behave like programs, not like request handlers. They accumulate state, call tools, wait for external events, and keep going after partial failures. +Production AI agents are not single HTTP requests. They are long-running programs that plan, call tools, wait on external systems, and keep internal state across dozens of decisions. -A stateless function can answer a prompt. It cannot reliably run an agent loop that spans dozens of tool calls and multiple minutes of wall-clock time. Durable execution fills that gap by persisting progress and replaying the orchestration logic after a cold start, crash, or scale event. +Stateless compute fights that shape. A cold start or a timeout resets the process mid-loop. A retry replays side effects unless you build your own idempotency ledger. Teams end up rebuilding durable execution out of database rows, queues, and scheduled jobs. 
-Workflow DevKit gives JavaScript/TypeScript the missing primitive: run an agent as a workflow function that reconstructs state by deterministic replay, while isolating side effects into step functions that can retry independently. +Workflow DevKit turns that pile of infrastructure back into code. You write an Agent as a workflow function. The runtime persists progress as an event log and deterministically replays the workflow to reconstruct state after failures, cold starts, or scale events. ## Current state -Most "production agents" ship as a pile of glue around a stateless compute unit. Engineers bolt on a database row for state, a queue for long work, and a set of ad-hoc idempotency keys to avoid duplicating side effects. - -That stack works until the agent does anything non-trivial. A single agent run can involve: planning with an LLM, fanning out to 5-10 tools, waiting on human input, then looping until a terminal condition. If the function times out mid-loop, the system has to reconstruct "what already happened" from whatever it managed to persist. - -Stateless retries amplify the problem. If a tool call fails transiently, the easiest recovery strategy is "retry the whole request." That replays earlier tool calls unless you built per-call checkpointing. When the earlier calls wrote to external systems (tickets, payments, emails), you also need idempotency across those systems. This turns the agent into a distributed transaction coordinator. +Most "production agent" stacks ship the same diagram with different logos: + +* An `agent_runs` table that stores conversation state, tool history, and a cursor. +* A queue that re-invokes the agent after every tool call. +* A cron job that scans for stuck runs, retries failed calls, and advances timers. +* Idempotency keys everywhere to avoid double-charging, double-emailing, or double-writing. + +This works, but it costs engineering time forever. Every tool integration becomes a mini state machine. 
Every new failure mode adds another column: `attempt`, `next_run_at`, `last_error`, `lock_owner`. The "agent" ends up split across handlers that must agree on invariants. + +Here's the pattern in code. + +**Before: DB row + queue for an Agent tool-calling loop** + +```ts +import { sql } from "./db"; +import { queue } from "./queue"; + +export async function runAgent(runId: string) { + const run = await sql`SELECT * FROM agent_runs WHERE id=${runId}`; + try { + const next = await llmPlan(run.state); + const toolOut = await callTool(next.tool, next.args, { + idempotencyKey: `${runId}:${run.step}`, + }); + await sql`UPDATE agent_runs SET state=${toolOut.state}, step=${run.step + 1} + WHERE id=${runId}`; + await queue.add("agent", { runId }, { jobId: `${runId}:${run.step + 1}` }); + } catch (err) { + await sql`UPDATE agent_runs SET retries=${run.retries + 1}, last_error=${String(err)} + WHERE id=${runId}`; + await queue.add("agent", { runId }, { delay: backoff(run.retries) }); + } +} +``` -Teams reinvent the same machinery: a run table, a step table, a dedupe key per side effect, and a scheduler for "wake me up later." The code that does the real work ends up scattered across handlers, cron jobs, and background workers. The agent logic becomes hard to reason about because it does not exist as a single program. +The code above "works" until it doesn't. You now own locking, exactly-once semantics, backoff, and recovery. Any bug that advances `step` at the wrong time corrupts the run. Any mismatch between the stored cursor and the tool history produces duplicated tool calls. ## The shift -Agent workloads push compute in a different direction than traditional APIs. +Durable execution flips the control plane. Instead of persisting *state* and reconstructing control flow, you persist *control flow* and reconstruct state. + +Workflow DevKit records every side effect boundary as an event. 
When the workflow restarts, the runtime replays the workflow from the top in a deterministic sandbox and feeds it the same event stream. Completed steps return their recorded results. Pending steps suspend the workflow and get scheduled. The workflow code stays readable because it is still just async TypeScript. + +**After: the same Agent as a workflow with steps** + +```ts +type AgentState = { messages: string[]; done: boolean }; + +async function llmPlan(state: AgentState) { + 'use step'; + return decideNextAction(state.messages); +} +async function callTool(name: string, args: unknown) { + 'use step'; + return tools[name](args); +} + +export async function agentLoop(initial: AgentState) { + 'use workflow'; + let state = initial; + while (state.done === false) { + const plan = await llmPlan(state); + state = await callTool(plan.tool, plan.args); + } + return state; +} +``` -Agents do more I/O per unit of business value. They spend most of their time waiting on other systems: model responses, rate limits, slow upstream APIs, and human approvals. The total wall-clock time for one run routinely exceeds the lifetime of any single compute instance. +The pain disappears because you stopped simulating a runtime in tables. The workflow function is the state machine. The durable log is the source of truth. Retries stop being a cross-cutting concern you re-implement for every tool. -Agents also make correctness harder. Tool calls create side effects. Re-executing a tool call changes the world twice. Retrying the wrong layer turns "recover from transient error" into "duplicate the user's refund." +## The vision -Finally, agents want concurrency. A useful run pulls context from multiple sources in parallel and merges results into a plan. JavaScript already has the right ergonomics (`Promise.all`, `Promise.race`), but stateless environments make the failure modes unpredictable. A single timeout can force a full rerun of parallel work unless you cache each unit explicitly. 
+Agents need four things that plain serverless does not provide: -This shift turns orchestration into infrastructure. You need a durable control plane for "what should happen next," not just a faster model. +1. **State across tool calls.** The agent has to remember what already happened. +2. **Selective retries.** A transient failure should retry one tool call, not the entire run. +3. **Parallel execution.** Agents fan out: retrieval + enrichment + verification. +4. **Long waits.** Human-in-the-loop and external systems do not fit in a 10-60 second timeout. -## The vision +Workflow DevKit maps those directly onto existing JavaScript primitives: -Treat an agent run as a workflow: deterministic orchestration code that drives side-effecting steps. +* Use local variables for state. The runtime reconstructs them by replay. +* Use `FatalError` and `RetryableError` inside steps to control retry and backoff. +* Use `Promise.all()` and `Promise.race()` in workflows for fanout and competition. +* Use `sleep()` for durable delays and hooks to pause until an external event arrives. -Workflow functions (`'use workflow'`) provide the control loop. They run in a sandboxed VM and focus on coordination: branching, looping, parallel composition, and waiting. Step functions (`'use step'`) do the work that touches the world: calling models, fetching from APIs, writing to databases, emitting notifications. +That last pair matters for agents because "waiting" is normal. A workflow can suspend while it waits for a webhook, a human approval, or an upstream batch job. The runtime resumes the workflow when the event shows up, without you writing a scheduler. -Workflow DevKit persists every step input and output to an append-only event log. When the workflow needs to continue — after a crash, after a scale event, or after a delay — the runtime replays the workflow function from the start and rehydrates state by replaying the event log. 
Completed steps return their recorded results instead of re-executing. +Retries are the other place teams burn weeks. The usual solution is a cron-driven state machine that retries failed calls and advances a `next_retry_at` timestamp. -The build pipeline enforces the boundary. An SWC transform splits a workflow file into separate bundles for workflow and step contexts, so the workflow VM never needs full Node.js access. That boundary makes determinism practical: the workflow sandbox disables global `fetch()` and timeout functions, and the runtime provides durable primitives like `sleep()` and hooks for external resumptions. +**Before: cron + state machine retry for flaky API calls** -The result looks like normal async code, because it is normal async code. You write a loop. You await steps. You compose steps in parallel. You throw errors. The runtime turns that into a resumable execution with a durable log of what happened. +```ts +import { sql } from "./db"; -## Proof and early signals +export async function retryCron() { + const jobs = await sql`SELECT * FROM api_calls + WHERE status='retry' AND run_at < now() + LIMIT 100`; + for (const job of jobs.rows) { + const res = await fetch(job.url, { method: "POST", body: job.body }); + const status = res.status < 500 ? "done" : "retry"; + await sql`UPDATE api_calls SET status=${status}, attempts=${job.attempts + 1}, + run_at=${nextRunAt(job.attempts)} WHERE id=${job.id}`; + } +} +``` -You can see the design goal — "replay must produce the same decisions" — in the runtime constraints. +That code turns "retry an HTTP call" into an operational subsystem. The database becomes a task scheduler. The cron job becomes a reliability layer. -The workflow VM runs with deterministic time and randomness. The runtime seeds `Math.random()`, fixes `Date.now()`, and advances time based on event timestamps during replay. That removes a class of heisenbugs where "the same code" makes different choices after a restart. 
+**After: RetryableError inside a step** -The sandbox blocks the usual sources of non-determinism and side effects. Global `fetch()` throws inside workflow functions. Timeout APIs throw. If you need HTTP, you move it into a step, or you use the `fetch` helper that executes as a step. If you need delays, you call `sleep('5m')`, which records a wait in the event log and schedules a future wake-up. +```ts +import { FatalError, RetryableError } from "workflow"; -Step isolation makes retries predictable. The runtime retries steps by default (3 retries unless you override per step). You can make failures explicit: throw `FatalError` to stop retrying and bubble the failure to the workflow, or throw `RetryableError` with a `retryAfter` to schedule a delayed retry. The runtime records `step_retrying` and `step_completed` events, so replay can skip completed work and only rerun the failed unit. +async function postInvoice(id: string) { + 'use step'; + const origin = process.env.INVOICE_API_ORIGIN ?? ""; + const res = await fetch(`${origin}/invoices/${id}`, { method: "POST" }); + if (res.status >= 500) throw new RetryableError("invoice API 5xx", { retryAfter: "30s" }); + if (res.ok === false) throw new FatalError(`invoice API ${res.status}`); + return res.json(); +} -The framework integrations show where this fits in a real app. In Next.js, `withWorkflow()` wires build-time transforms and generates route handlers under `/.well-known/workflow/v1/`. You run the agent from your app code with `start()`, and the platform routes execution through those durable endpoints. +export async function invoiceAgent(id: string) { + 'use workflow'; + return await postInvoice(id); +} +``` -Observability ships as part of the workflow story. The CLI exposes run inspection and a web UI. That matters for agents because "what happened" often matters more than "what returned." +The step throws a structured error. 
The runtime persists that failure, schedules a retry with backoff, and replays the workflow without re-running completed work. ## Next steps -Start by carving your agent into a workflow loop plus steps. Keep the workflow function pure orchestration and move every side effect into a step. +Treat "Agent" as a workflow boundary, not a request handler. Keep the workflow deterministic and push I/O into steps. If a piece of code needs the network, the filesystem, or a timer, it belongs in a step. + +Start small. Pick one agent loop that currently writes state to a database and triggers itself via a queue. Move the loop into a workflow function. Wrap each tool call in a step function. Replace cron-based retry with `RetryableError` and durable `sleep()`. -Then run the local UI and watch a run advance step-by-step: +Run the workflow locally, then inspect the event log and step timeline. ```bash -npx workflow web +npx -y -p @workflow/cli wf inspect runs ``` - ---- - -## Style justification - -**What works against the Vercel blog standard:** -- The opening thesis hits the pattern exactly — short declarative diagnosis, grounding the reader in a technical reality ("agents behave like programs, not like request handlers") rather than pitching a product. -- "Current state" uses the problem-evidence-claim pattern with specific technical detail ("a run table, a step table, a dedupe key per side effect"). It describes the problem space the way a Vercel engineer would: diagnostic, not dramatic. -- Paragraphs stay at 2-4 sentences. One idea per paragraph. No hedging words. No "we believe" or "we're excited." -- The title "Durable execution layer" follows the vision post pattern: short declarative noun phrase, no verb. -- Closing with `npx workflow web` follows the Vercel CTA pattern — a runnable command, not "stay tuned." - -**What could be stronger:** -- The "Proof and early signals" section runs long (5 paragraphs). Vercel vision posts typically keep this to 1-2 paragraphs. 
The runtime detail (deterministic time seeding, `Math.random()`) feels more like engineering deep-dive material. -- Missing specific numbers. Vercel posts anchor claims in data ("2.7 million deployments daily"). This post has zero quantified metrics. Even a figure like "a typical agent run spans 15-40 tool calls" would add Vercel-style authority. -- No customer/internal validation. The "Self-driving infrastructure" post references Vercel Agent and real internal usage. This article stays abstract. - -**Alternative approaches:** -1. **Customer-anchored vision:** Open with a specific team's experience building agents on stateless functions, then zoom out to the thesis. Matches how "Self-driving infrastructure" grounds in framework-defined infrastructure before going conceptual. -2. **Shorter + punchier:** Cut "Proof and early signals" to 2 paragraphs, move technical details to a linked engineering deep-dive. Vision posts are meant to inspire, not explain internals. -3. **Counter-narrative framing:** Open with "Most agent demos work. Most agent deployments don't." — a blunt one-liner like "Transcript formats are a mess" that hooks engineers who've hit this wall. From 8d48f3392349bb89a898097b51af3d5bb82609c9 Mon Sep 17 00:00:00 2001 From: "swarm:codex-blog-v2-2" Date: Mon, 16 Feb 2026 12:53:27 -0700 Subject: [PATCH 6/8] docs(blog): replace deterministic replay article content Overwrite the blog markdown with the exact provided copy for Workflow DevKit deterministic replay behavior and code patterns. Verified: diff -u .blog/how-deterministic-replay-works-for-ai-agents.md <(cat <<'EOF' ... 
EOF) (no differences) Verified: pnpm exec biome check .blog/how-deterministic-replay-works-for-ai-agents.md (path ignored by biome config; no file-level lint available) --- ...eterministic-replay-works-for-ai-agents.md | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 .blog/how-deterministic-replay-works-for-ai-agents.md diff --git a/.blog/how-deterministic-replay-works-for-ai-agents.md b/.blog/how-deterministic-replay-works-for-ai-agents.md new file mode 100644 index 0000000000..a8561a6925 --- /dev/null +++ b/.blog/how-deterministic-replay-works-for-ai-agents.md @@ -0,0 +1,161 @@ +# How Workflow DevKit executes Agents with deterministic replay + +## Problem + +An Agent that calls tools is a distributed system in a single function body. It crosses process boundaries every time it waits on the network, hits a timeout, or gets retried by the platform. Stateless retries re-run code, not intent. + +The usual mitigation is "checkpoint everything." After every tool call you write a cursor and a blob of state to durable storage. On restart you read the checkpoint and try to reconstruct what happened. This approach turns agent code into a database-backed interpreter. + +## Approach + +Workflow DevKit splits Agent code into two execution models: + +* **Workflow functions** (`'use workflow'`) run inside a sandboxed VM. They orchestrate control flow, hold state in local variables, and stay deterministic. +* **Step functions** (`'use step'`) run with full Node.js access. They perform side effects: network calls, SDKs, file I/O, crypto, and timers. + +The runtime persists every step boundary as an event in an append-only log. When the workflow runs again, it replays the workflow from the top, feeds it the same event stream, and returns recorded results for completed steps. Only missing or failed steps execute. 
+ +That design targets the failure modes that break agents in production: cold starts mid-conversation, platform timeouts, partial success in parallel fanout, and flaky tool calls. + +## Implementation details + +### Build-time split: workflow bundle vs step bundle + +A workflow file contains both orchestrator code and side-effecting code. Workflow DevKit's build pipeline uses an SWC transform to recognize the `'use workflow'` / `'use step'` directives and split them into separate bundles. + +That split is what makes the runtime model crisp: orchestrators run in a deterministic VM, and steps run in normal Node.js. You still write a single file. + +### Determinism in the workflow VM + +The workflow VM runs under constraints that make replay reliable: + +* `Math.random()` is seeded per workflow run. +* `Date.now()` is fixed and advanced based on event timestamps during replay. +* `crypto.getRandomValues()` and `crypto.randomUUID()` are deterministic. +* `process.env` is copied and frozen. +* Timer APIs (`setTimeout`, `setInterval`, `setImmediate`) throw. Use durable `sleep()` instead. +* Global `fetch` is blocked in workflows. Put network I/O in steps. + +This matters for agents because non-determinism breaks replay. If the orchestrator reads "now" or random data to decide which tool to call, it must see the same values on every replay. + +### Event log + suspension + +A workflow run consumes an ordered event stream. When the workflow hits an awaited step, it looks for events with the step's correlation id: + +* `step_created` confirms the step exists. +* `step_started`, `step_retrying`, `step_completed`, `step_failed` drive resolution. +* `wait_created` / `wait_completed` back durable `sleep()`. +* `hook_created` and hook completion events back external resumes. + +When an awaited step has no matching event yet, the workflow throws a `WorkflowSuspension`. The suspension carries a queue of pending invocations (steps, waits, hooks). 
The runtime handler persists the missing `*_created` events and enqueues step executions with an idempotency key equal to the correlation id. + +The workflow stops at that point. Step workers run, append completion or retry events, and re-enqueue the workflow. On the next replay, the workflow re-runs the same code and picks up exactly where it left off. + +### Built-in retries at the step boundary + +Step execution owns retries. A step can fail in three ways: + +* Throw `FatalError` to fail the step and bubble the error to the workflow. +* Throw `RetryableError` to retry with an explicit `retryAfter`. +* Throw any other error to retry with the default policy, up to `maxRetries` (default is 3). + +Retries do not re-run completed steps. The event log preserves the successful work and the orchestrator replays it. + +## Code patterns + +### Crash recovery without checkpoints + +**Before: manual checkpoint writes and cursor recovery** + +```ts +import { sql } from "./db"; + +export async function agentHandler(runId: string) { + const run = await sql`SELECT cursor, state FROM agent_runs WHERE id=${runId}`; + let { cursor, state } = run.rows[0]; + while (cursor < state.plan.length) { + const out = await tools[state.plan[cursor]](state); + cursor += 1; + state = { ...state, out }; + await sql`UPDATE agent_runs SET cursor=${cursor}, state=${state} WHERE id=${runId}`; + } + return state; +} +``` + +This is a checkpointed interpreter. Every loop iteration writes to storage so the next invocation can reconstruct progress. + +**After: deterministic replay, no explicit checkpoints** + +```ts +async function runTool(name: string, input: unknown) { + 'use step'; + return tools[name](input); +} + +export async function agentRun(plan: { name: string }[], initial: unknown) { + 'use workflow'; + let state = initial; + for (const action of plan) state = await runTool(action.name, state); + return state; +} +``` + +The workflow stores state in local variables. 
The runtime reconstructs those variables on replay by feeding recorded step results back into the same loop. + +### Parallel fanout without bespoke orchestration + +Agents fan out to keep latency bounded: search + fetch + summarize in parallel. The hard part is partial success. One branch can succeed while another fails, and a stateless retry re-executes both unless you persist per-branch outputs. + +**Before: custom fanout bookkeeping to avoid redoing work** + +```ts +import { sql } from "./db"; + +export async function fanout(runId: string) { + await sql`UPDATE runs SET status='running' WHERE id=${runId}`; + const [a, b] = await Promise.allSettled([callA(), callB()]); + if (a.status === "fulfilled") await sql`UPDATE runs SET a=${a.value} WHERE id=${runId}`; + if (b.status === "fulfilled") await sql`UPDATE runs SET b=${b.value} WHERE id=${runId}`; + if (a.status === "rejected" || b.status === "rejected") throw new Error("retry later"); + return { a: a.value, b: b.value }; +} +``` + +You persist intermediate results because the platform does not. + +**After: Promise.all over durable steps** + +```ts +async function fetchA() { + 'use step'; + return callA(); +} +async function fetchB() { + 'use step'; + return callB(); +} + +export async function fanoutWorkflow() { + 'use workflow'; + const [a, b] = await Promise.all([fetchA(), fetchB()]); + return { a, b }; +} +``` + +Each step has its own event history and retry policy. If `fetchB()` fails and retries, `fetchA()` replays from its `step_completed` event without re-executing. + +## Results + +Workflow DevKit moves agent reliability into the runtime instead of your app code: + +* Cold starts and timeouts resume from the event log, not from ad hoc checkpoints. +* Tool-call retries are selective. Completed steps return recorded results. +* Parallel fanout uses ordinary `Promise.all()` with independent step retries. +* Long waits become first-class via durable `sleep()` and hook-based resume. 
+ +The operational surface area shrinks. You stop maintaining a queue protocol, a scheduler, and a state machine schema per agent. + +```bash +npx -y -p @workflow/cli wf inspect runs --limit 10 +``` From d73f2ad1bfb75538581c1a45bbf0332100830acd Mon Sep 17 00:00:00 2001 From: "swarm:codex-blog-v2-3" Date: Mon, 16 Feb 2026 12:55:53 -0700 Subject: [PATCH 7/8] docs(blog): replace stateless retries article text Overwrite the target blog file with the exact markdown content provided for this task. This preserves all headings, prose, and fenced code examples verbatim. Verified: cat <<'EOF' | diff -u - .blog/stateless-retries-multiply-agent-side-effects.md (no diff; exit 0) Verified: repository Biome config does not include Markdown files (.md), so no project-scoped markdown linter applies --- ...ess-retries-multiply-agent-side-effects.md | 151 ++++++++++++------ 1 file changed, 101 insertions(+), 50 deletions(-) diff --git a/.blog/stateless-retries-multiply-agent-side-effects.md b/.blog/stateless-retries-multiply-agent-side-effects.md index 890bd022aa..97ff7a1d21 100644 --- a/.blog/stateless-retries-multiply-agent-side-effects.md +++ b/.blog/stateless-retries-multiply-agent-side-effects.md @@ -1,83 +1,134 @@ -# Stateless retries multiply agent side effects +# Workflow DevKit keeps Agents' tool-call volume linear under retries ## Headline finding -Stateless architectures turn agent reliability problems into duplication problems. Any restart — timeout, crash, deploy, scale event — pushes you toward re-executing earlier tool calls unless you built a persistence layer that can prove what already happened. +Stateless retries turn an Agent's tool calls into repeated work. As the number of tool calls per run grows, the expected number of executed calls grows faster than linearly because a single failure forces a full replay of the prefix. -Durable execution flips that.
It records step inputs and outputs in an event log and deterministically replays the agent loop, so restarts rehydrate state instead of repeating side effects. +Workflow DevKit changes the unit of retry. The workflow replays deterministically, but completed steps return recorded results. A transient failure retries one step, not the entire Agent turn. ## Methodology -Model an agent run as a sequence of `n` tool calls. Each call succeeds with probability `s` and fails transiently with probability `p = 1 - s`. +Model an Agent run as `N` sequential tool calls. Each call fails transiently with probability `p` and succeeds with probability `q = 1 - p`. -Compare two implementations: +Compare two retry strategies: -- **Stateless restart:** on any failure (or timeout that looks like failure), rerun the whole function from the beginning. This matches the common "retry the request" approach when you lack per-call checkpointing. -- **Durable steps:** isolate each tool call in a step (`'use step'`). If a call fails transiently, retry that step. The workflow (`'use workflow'`) replays from the event log and skips completed steps. +* **Stateless retry:** a failure restarts the whole run from tool call 1. +* **Durable step retry:** a failure retries only the failed call; prior successful calls do not re-execute. -This is a simplified model. It ignores correlated failures and assumes you can retry until success. In the real system you cap retries (Workflow DevKit defaults to 3 retries unless you override `stepFn.maxRetries`) and you may delay retries with `RetryableError`. +This isolates the retry surface area. It does not assume anything about the LLM or tools beyond an independent per-call failure rate. ## Data -The cost metric is "how many tool calls do we execute to finish one successful run," because tool calls dominate agent cost and risk (tokens, rate limits, side effects). +With stateless retry, the run completes only after it achieves `N` consecutive successful calls. 
The expected number of executed calls is: -For the stateless restart model, the run succeeds only after `n` consecutive successes. The expected number of calls until that happens is: +`E_stateless = (1 - q^N) / (p * q^N)` -`E[calls] = (1 - s^n) / (p * s^n)` +With durable step retry, each call is a geometric retry until success, so: -For durable steps, each call retries independently. The expected number of call attempts is: +`E_durable = N / q` -`E[calls] = n / s` +Concrete numbers: -| Steps in run (`n`) | Transient failure rate (`p`) | Stateless restart: expected calls | Durable steps: expected calls | Restart overhead | Durable overhead | -| --: | --: | --: | --: | --: | --: | -| 10 | 1% | 10.57 | 10.10 | 1.06x | 1.01x | -| 20 | 1% | 22.26 | 20.20 | 1.11x | 1.01x | -| 40 | 1% | 49.48 | 40.40 | 1.24x | 1.01x | -| 10 | 5% | 13.40 | 10.53 | 1.34x | 1.05x | -| 20 | 5% | 35.79 | 21.05 | 1.79x | 1.05x | -| 40 | 5% | 135.63 | 42.11 | 3.39x | 1.05x | +* `p = 0.02`, `N = 40`: stateless `62.2` calls vs durable `40.8` calls (1.52x). +* `p = 0.05`, `N = 20`: stateless `35.8` calls vs durable `21.1` calls (1.70x). +* `p = 0.10`, `N = 40`: stateless `666.5` calls vs durable `44.4` calls (15.0x). -The gap grows with run length. At a 5% transient failure rate across 40 calls, the stateless restart model executes ~3.4x the work, on average, to get one successful completion. - -That "extra work" is not free retries. It is duplicated tool calls. If any of those tool calls write to external systems, you also created duplicated side effects unless you designed every tool integration to be idempotent. +The ratio compounds because stateless retry forces the run to finish the entire chain without a single transient failure. Durable steps turn that into independent retries per call. ## Core insight -Agent workflows compound failure probability. A single run touches many systems, and each system has its own tail latency and transient errors. 
- -Stateless runtimes give you one recovery primitive: re-execute code. That works for pure functions. Agents are not pure. They read and write external state, and they do it many times per run. - -Workflow DevKit uses a different recovery primitive: replay. The workflow function runs in a deterministic sandbox. Steps isolate side effects and persist their results to an append-only event log. On restart, the workflow replays and step calls resolve from recorded outputs. A transient failure retries only the failing step. +In agent workloads, the expensive part is not the control flow. It is the tool boundary: API calls, database writes, emails, payments, rate-limited endpoints. Stateless retry replays those boundaries unless the application builds its own ledger of what already executed. -This is the practical difference between "retry is correct" and "retry is dangerous." +That ledger is the same thing a durable runtime provides: an event log keyed by stable correlation ids. Workflow DevKit already emits a correlation id per step and records its lifecycle (`created`, `started`, `retrying`, `completed`, `failed`). Replay rehydrates the workflow and returns step results without re-executing successful calls. ## Practical takeaway -If your agent touches external systems, treat every tool call as a durable step. Keep orchestration in the workflow and error policy in steps (`FatalError` for permanent failures, `RetryableError` for transient failures with backoff). +Use durable steps for every side-effecting tool call. Keep the workflow function deterministic and let the runtime handle replay and selective retry. If a tool supports idempotency keys, derive the key from the step correlation id instead of inventing your own scheme. 
+ +### Stateless retry duplicates work + +**Before: retrying an Agent turn replays the full prefix** + +```ts +export async function agentTurn(input: Input) { + for (let attempt = 1; attempt <= 5; attempt += 1) { + try { + const a = await toolA(input); + const b = await toolB(a); + const c = await toolC(b); + return { a, b, c }; + } catch (err) { + if (attempt === 5) throw err; + await sleepMs(1000 * attempt); + } + } + throw new Error("unreachable"); +} +``` -```bash -npx workflow inspect run +**After: durable steps replay successful calls and retry only the failed one** + +```ts +import { RetryableError } from "workflow"; + +async function toolA(input: Input) { 'use step'; return callA(input); } +async function toolB(a: A) { 'use step'; return callB(a); } +async function toolC(b: B) { + 'use step'; + const res = await callC(b); + if (res.transient === true) throw new RetryableError("toolC transient", { retryAfter: "2s" }); + return res; +} + +export async function agentTurn(input: Input) { + 'use workflow'; + const a = await toolA(input); + const b = await toolB(a); + return await toolC(b); +} ``` ---- +### Stop managing idempotency keys by hand -## Style justification +**Before: generating and persisting idempotency keys across retries** -**What works extremely well:** -- Title follows the research post pattern perfectly — states the finding ("Stateless retries multiply agent side effects"), not the question. Compare to "AGENTS.md outperforms skills in our agent evals." -- The data table is the strongest element. It follows the Vercel research pattern: simplest possible table, one clear variable, ascending values that build to the headline result (3.39x overhead). The "AGENTS.md" post used the same structure. -- Mathematical formulas add genuine authority. This is not opinion — it is a derivable result. Vercel research posts lead with data, not argument. 
-- "This is a simplified model" matches the Vercel pattern of honest difficulty: acknowledge limitations as technical facts, not apologies. -- "retry is correct" vs. "retry is dangerous" — this closing line is the kind of quotable insight Vercel research posts end on. Compare to "Passive context beats active retrieval." -- Paragraphs are tight. The core insight section is 4 sentences, 3 paragraphs. Maximum density. +```ts +import { sql } from "./db"; +import { randomUUID } from "crypto"; -**What could be stronger:** -- The methodology section describes a theoretical model, not empirical data. The "AGENTS.md" post ran actual evals with pass rates. A reader might ask "did you measure this on real agent runs?" Adding even one real-world data point (e.g., "across 1,000 production agent runs, we observed a 4.2% transient failure rate") would close that gap. -- The formulas assume geometric distributions and independence — reasonable but worth stating explicitly for a research-style post. The "simplified model" caveat partially covers this. -- No visual. Vercel research posts with tables often benefit from a chart showing the divergence curve. The 3.39x figure at 40 steps / 5% failure rate deserves a visual. +export async function purchase(runId: string, userId: string) { + const row = await sql`SELECT charge_key, email_key FROM runs WHERE id=${runId}`; + const chargeKey = row.charge_key ?? randomUUID(); + const emailKey = row.email_key ?? 
randomUUID(); + await sql`UPDATE runs SET charge_key=${chargeKey}, email_key=${emailKey} WHERE id=${runId}`; + await stripe.charges.create({ amount: 499, customer: userId }, { idempotencyKey: chargeKey }); + await sendReceiptEmail(userId, { idempotencyKey: emailKey }); +} +``` + +**After: use the step correlation id as the idempotency key** + +```ts +import { getStepMetadata } from "workflow"; + +async function chargeCard(userId: string, amount: number) { + 'use step'; + const { stepId } = getStepMetadata(); + return stripe.charges.create({ amount, customer: userId }, { idempotencyKey: stepId }); +} +async function sendReceipt(userId: string) { + 'use step'; + const { stepId } = getStepMetadata(); + await mailer.sendReceipt({ userId }, { idempotencyKey: stepId }); +} + +export async function purchase(userId: string) { + 'use workflow'; + await chargeCard(userId, 499); + await sendReceipt(userId); +} +``` -**Alternative approaches:** -1. **Empirical-first:** Run actual agent workloads on stateless vs. durable, measure tool call counts, and report observed overhead. Replace the theoretical model with production data. Harder to produce but more authoritative. -2. **Side-effect focused:** Narrow the article to side-effect duplication specifically. Title: "Agent retries duplicate side effects at scale." Drop the formula and instead catalog real failure scenarios: double-charged payments, duplicate tickets, repeated emails. More visceral for engineering readers. -3. **Comparative architecture:** Add a third column to the table — "stateless with manual checkpointing" — showing the engineering effort to approach durable performance without the framework. This makes the build-vs-buy argument explicit without stating it. 
+```bash +npx -y -p @workflow/cli wf inspect runs +``` From d2d283ecafaec2279edbf4c0ade8347c09f42318 Mon Sep 17 00:00:00 2001 From: John Lindquist Date: Wed, 18 Feb 2026 10:56:12 -0700 Subject: [PATCH 8/8] docs(recipes): add error monitoring and alerting workflows guide --- docs/content/docs/meta.json | 1 + .../content/docs/recipes/error-monitoring.mdx | 395 ++++++++++++++++++ docs/content/docs/recipes/index.mdx | 17 + docs/content/docs/recipes/meta.json | 4 + 4 files changed, 417 insertions(+) create mode 100644 docs/content/docs/recipes/error-monitoring.mdx create mode 100644 docs/content/docs/recipes/index.mdx create mode 100644 docs/content/docs/recipes/meta.json diff --git a/docs/content/docs/meta.json b/docs/content/docs/meta.json index 218fd4fa51..afcbce4849 100644 --- a/docs/content/docs/meta.json +++ b/docs/content/docs/meta.json @@ -7,6 +7,7 @@ "how-it-works", "observability", "ai", + "recipes", "deploying", "errors", "api-reference" diff --git a/docs/content/docs/recipes/error-monitoring.mdx b/docs/content/docs/recipes/error-monitoring.mdx new file mode 100644 index 0000000000..095fb832fc --- /dev/null +++ b/docs/content/docs/recipes/error-monitoring.mdx @@ -0,0 +1,395 @@ +--- +title: Error Monitoring & Alerting +description: Build workflows that triage errors, process alerts, and dispatch notifications across channels. +type: guide +summary: Classify errors by severity, deduplicate alerts, and fan out notifications to multiple channels. +prerequisites: + - /docs/foundations/workflows-and-steps + - /docs/foundations/errors-and-retries +related: + - /docs/foundations/hooks + - /docs/foundations/common-patterns + - /docs/api-reference/workflow/create-webhook +--- + +This guide covers building workflows whose purpose is to monitor external systems, classify errors, and route alerts. For handling errors that occur _inside_ your own workflows, see [Errors & Retrying](/docs/foundations/errors-and-retries). 
+ +Error monitoring is one of the most common workflow use cases. A typical pipeline receives error events from external systems, classifies them, deduplicates repeat occurrences, and dispatches notifications to the right channels. Workflows are a natural fit because they survive failures, retry flaky notification APIs, and maintain state across long-running monitoring loops. + +## Error Triage Workflow + +The simplest error monitoring workflow receives an error event via webhook, classifies it by severity, and routes it to the appropriate handler. + +```typescript title="workflows/error-triage.ts" lineNumbers +import { createWebhook } from "workflow"; + +interface ErrorEvent { + source: string; + message: string; + stack?: string; + metadata?: Record<string, unknown>; +} + +async function classifyError(event: ErrorEvent) { + "use step"; + + // Classify based on error patterns + if (event.message.includes("FATAL") || event.message.includes("OOM")) { + return "critical" as const; + } + if (event.message.includes("timeout") || event.message.includes("rate limit")) { + return "warning" as const; + } + return "info" as const; +} + +async function handleCritical(event: ErrorEvent) { // [!code highlight] + "use step"; + // Page on-call, create incident ticket, etc. 
+ console.log(`CRITICAL: ${event.source} - ${event.message}`); +} + +async function handleWarning(event: ErrorEvent) { + "use step"; + // Post to team Slack channel + console.log(`WARNING: ${event.source} - ${event.message}`); +} + +async function handleInfo(event: ErrorEvent) { + "use step"; + // Log for later review + console.log(`INFO: ${event.source} - ${event.message}`); +} + +export async function errorTriageWorkflow() { + "use workflow"; + + const webhook = createWebhook(); // [!code highlight] + console.log("Listening for errors at:", webhook.url); + + for await (const request of webhook) { // [!code highlight] + const event: ErrorEvent = await request.json(); + const severity = await classifyError(event); + + if (severity === "critical") { + await handleCritical(event); + } else if (severity === "warning") { + await handleWarning(event); + } else { + await handleInfo(event); + } + } +} +``` + +The workflow creates a persistent webhook endpoint. External systems POST error events to it. Each event is classified in a step (with full Node.js access for pattern matching, database lookups, or ML inference), then routed to the correct handler. Because the webhook uses `for await...of`, the workflow stays alive and processes errors as they arrive. + + +Webhooks implement `AsyncIterable`, so a single workflow instance can process an unlimited stream of events over time. See [Hooks & Webhooks](/docs/foundations/hooks) for details on iteration and custom tokens. + + +## Alert Processing Pipeline + +Real alert pipelines need deduplication. When the same error fires hundreds of times in a minute, you want one alert, not hundreds. Use custom hook tokens to route duplicate events to the same workflow instance. 
+ +```typescript title="workflows/alert-pipeline.ts" lineNumbers +import { createHook } from "workflow"; + +interface Alert { + alertId: string; + source: string; + message: string; + timestamp: number; +} + +interface EnrichedAlert extends Alert { + service: string; + owner: string; + runbook: string; +} + +async function enrichAlert(alert: Alert): Promise<EnrichedAlert> { + "use step"; + + // Look up service metadata from your registry + const service = alert.source.split("/")[0]; + return { + ...alert, + service, + owner: `team-${service}`, + runbook: `https://runbooks.internal/${service}/${alert.alertId}`, + }; +} + +async function dispatchNotification(alert: EnrichedAlert) { + "use step"; + + await fetch("https://hooks.slack.com/services/...", { + method: "POST", + body: JSON.stringify({ + text: `[${alert.source}] ${alert.message}\nOwner: ${alert.owner}\nRunbook: ${alert.runbook}`, + }), + }); +} + +export async function alertPipelineWorkflow(alertId: string) { // [!code highlight] + "use workflow"; + + // Custom token ensures duplicate alerts route here + const hook = createHook<Alert>({ token: `alert:${alertId}` }); // [!code highlight] + + // Process the first alert + const alert = await hook; + const enriched = await enrichAlert(alert); + await dispatchNotification(enriched); +} +``` + +The key pattern here is the custom hook token. When your ingestion layer receives an alert, it can use [`resumeHook()`](/docs/api-reference/workflow-api/resume-hook) to send the payload to a workflow keyed by `alertId`. If the workflow is already running for that alert, the event is delivered to the existing instance. This gives you natural deduplication: one workflow per unique alert. 
+ +```typescript title="app/api/alerts/route.ts" lineNumbers +import { start } from "workflow/api"; +import { resumeHook } from "workflow/api"; +declare function alertPipelineWorkflow(alertId: string): Promise<void>; // @setup + +export async function POST(request: Request) { + const alert = await request.json(); + + // Start workflow for new alerts, or deliver to existing one + await start(alertPipelineWorkflow, [alert.alertId]); // [!code highlight] + await resumeHook(`alert:${alert.alertId}`, alert); // [!code highlight] + + return new Response("OK"); +} +``` + +## Real-Time Alert Dispatch + +When a critical event needs immediate attention, fan out notifications to multiple channels in parallel using `Promise.all`. Each channel is its own step, so a failure in one (e.g., Slack API is down) does not block the others, and each is retried independently. + +```typescript title="workflows/instant-alert.ts" lineNumbers +import { createWebhook } from "workflow"; + +interface CriticalEvent { + title: string; + description: string; + severity: "P1" | "P2"; + source: string; +} + +async function sendSlackAlert(event: CriticalEvent) { + "use step"; + + await fetch("https://hooks.slack.com/services/...", { + method: "POST", + body: JSON.stringify({ + text: `*${event.severity}: ${event.title}*\n${event.description}`, + }), + }); +} + +async function sendEmailAlert(event: CriticalEvent) { + "use step"; + + await fetch("https://api.sendgrid.com/v3/mail/send", { + method: "POST", + headers: { Authorization: `Bearer ${process.env.SENDGRID_KEY}` }, + body: JSON.stringify({ + to: "oncall@example.com", + subject: `${event.severity}: ${event.title}`, + text: event.description, + }), + }); +} + +async function createPagerDutyIncident(event: CriticalEvent) { + "use step"; + + await fetch("https://events.pagerduty.com/v2/enqueue", { + method: "POST", + body: JSON.stringify({ + routing_key: process.env.PAGERDUTY_KEY, + event_action: "trigger", + payload: { + summary: `${event.severity}: 
${event.title}`, + source: event.source, + severity: event.severity === "P1" ? "critical" : "error", + }, + }), + }); +} + +export async function instantAlertWorkflow() { + "use workflow"; + + const webhook = createWebhook(); + + const request = await webhook; + const event: CriticalEvent = await request.json(); + + // Fan out to all channels in parallel + await Promise.all([ // [!code highlight] + sendSlackAlert(event), // [!code highlight] + sendEmailAlert(event), // [!code highlight] + createPagerDutyIncident(event), // [!code highlight] + ]); // [!code highlight] +} +``` + +Because each notification is a separate step, the framework retries failures independently. If PagerDuty returns a 500, Slack and email still succeed, and the PagerDuty step retries on its own schedule. + +## External System Monitoring + +Not all monitoring is event-driven. Sometimes you need to poll external systems on a schedule. Use [`sleep()`](/docs/api-reference/workflow/sleep) in a loop to create a durable polling workflow that survives restarts and cold starts. + +```typescript title="workflows/monitor-service.ts" lineNumbers +import { sleep } from "workflow"; + +interface ServiceStatus { + healthy: boolean; + latency: number; + errorRate: number; +} + +async function checkServiceHealth(endpoint: string): Promise<ServiceStatus> { + "use step"; + + const start = Date.now(); + const response = await fetch(endpoint); + const latency = Date.now() - start; + + return { + healthy: response.ok, + latency, + errorRate: response.ok ? 0 : 1, + }; +} + +async function sendAlert(service: string, status: ServiceStatus) { + "use step"; + + await fetch("https://hooks.slack.com/services/...", { + method: "POST", + body: JSON.stringify({ + text: `Service ${service} is unhealthy. 
Latency: ${status.latency}ms`, + }), + }); +} + +export async function monitorServiceWorkflow( + service: string, + endpoint: string +) { + "use workflow"; + + let consecutiveFailures = 0; + + while (true) { // [!code highlight] + const status = await checkServiceHealth(endpoint); + + if (!status.healthy) { + consecutiveFailures++; + if (consecutiveFailures >= 3) { + await sendAlert(service, status); + consecutiveFailures = 0; + } + } else { + consecutiveFailures = 0; + } + + await sleep("5m"); // [!code highlight] + } +} +``` + +The `sleep("5m")` call is durable - if the workflow process restarts during the sleep, it resumes at the correct time without re-running previous checks. The `while (true)` loop runs indefinitely, checking the service every 5 minutes and alerting after 3 consecutive failures. + + +`sleep()` accepts duration strings like `"5m"`, `"1h"`, or `"30s"`, as well as `Date` objects for sleeping until a specific time. See the [`sleep()` API reference](/docs/api-reference/workflow/sleep) for all supported formats. + + +## Content Security Scanning + +Workflows can also monitor content against security or policy rules. This pattern receives content via webhook, scans it in a step, and takes action on violations. 
+ +```typescript title="workflows/content-security.ts" lineNumbers +import { createWebhook } from "workflow"; + +interface ContentEvent { + contentId: string; + body: string; + author: string; + type: "post" | "comment" | "message"; +} + +interface ScanResult { + passed: boolean; + violations: string[]; +} + +async function scanContent(event: ContentEvent): Promise<ScanResult> { + "use step"; + + const violations: string[] = []; + + // Check against policy rules + const blockedPatterns = [/credential/i, /api[_-]?key/i, /password\s*=/i]; + for (const pattern of blockedPatterns) { + if (pattern.test(event.body)) { + violations.push(`Blocked pattern: ${pattern.source}`); + } + } + + return { passed: violations.length === 0, violations }; +} + +async function quarantineContent(contentId: string, violations: string[]) { + "use step"; + + // Move content to review queue + await fetch("https://api.internal/content/quarantine", { + method: "POST", + body: JSON.stringify({ contentId, violations }), + }); +} + +async function notifySecurityTeam(event: ContentEvent, result: ScanResult) { + "use step"; + + await fetch("https://hooks.slack.com/services/...", { + method: "POST", + body: JSON.stringify({ + text: `Content violation in ${event.type} by ${event.author}: ${result.violations.join(", ")}`, + }), + }); +} + +export async function contentSecurityWorkflow() { + "use workflow"; + + const webhook = createWebhook(); + + for await (const request of webhook) { + const event: ContentEvent = await request.json(); + const result = await scanContent(event); // [!code highlight] + + if (!result.passed) { // [!code highlight] + await Promise.all([ + quarantineContent(event.contentId, result.violations), + notifySecurityTeam(event, result), + ]); + } + } +} +``` + +The scanning step has full Node.js access, so it can call external scanning APIs, run regex-based rules, or invoke ML models. When a violation is found, the workflow quarantines the content and notifies the security team in parallel. 
+ +## Related Documentation + +- [Errors & Retrying](/docs/foundations/errors-and-retries) - Handle errors inside your own steps with retry semantics +- [Hooks & Webhooks](/docs/foundations/hooks) - Deep dive on hooks, webhooks, and custom tokens +- [Common Patterns](/docs/foundations/common-patterns) - Sequential, parallel, timeout, and composition patterns +- [`createWebhook()` API Reference](/docs/api-reference/workflow/create-webhook) - Full webhook API documentation +- [`createHook()` API Reference](/docs/api-reference/workflow/create-hook) - Full hook API documentation +- [`sleep()` API Reference](/docs/api-reference/workflow/sleep) - Sleep and scheduling API diff --git a/docs/content/docs/recipes/index.mdx b/docs/content/docs/recipes/index.mdx new file mode 100644 index 0000000000..6d4295c6b2 --- /dev/null +++ b/docs/content/docs/recipes/index.mdx @@ -0,0 +1,17 @@ +--- +title: Recipes +description: Production-ready patterns and workflows you can adapt for your own applications. +type: overview +summary: Build real-world workflows using proven patterns from production use cases. +related: + - /docs/foundations/workflows-and-steps + - /docs/foundations/common-patterns +--- + +Recipes are practical, end-to-end guides that show how to build complete workflows for common use cases. Each recipe builds on the [Foundations](/docs/foundations) and demonstrates patterns you can adapt for your own applications. + + + + Build workflows that triage errors, deduplicate alerts, and dispatch notifications across channels. + + diff --git a/docs/content/docs/recipes/meta.json b/docs/content/docs/recipes/meta.json new file mode 100644 index 0000000000..d518eaec7d --- /dev/null +++ b/docs/content/docs/recipes/meta.json @@ -0,0 +1,4 @@ +{ + "title": "Recipes", + "pages": ["error-monitoring"] +}