diff --git a/.changeset/ninety-dancers-brush.md b/.changeset/ninety-dancers-brush.md new file mode 100644 index 0000000000..7f2a7c6528 --- /dev/null +++ b/.changeset/ninety-dancers-brush.md @@ -0,0 +1,6 @@ +--- +"@workflow/core": patch +"@workflow/cli": patch +--- + +**BREAKING CHANGE**: Make `getWorld` and `createWorld` asynchronous to support ESM dynamic imports for custom world modules. All callers must now `await getWorld()` and `await createWorld()`. diff --git a/docs/content/docs/api-reference/workflow-api/get-world.mdx b/docs/content/docs/api-reference/workflow-api/get-world.mdx index 6d45095429..2feb7a6ab1 100644 --- a/docs/content/docs/api-reference/workflow-api/get-world.mdx +++ b/docs/content/docs/api-reference/workflow-api/get-world.mdx @@ -1,20 +1,20 @@ --- title: getWorld -description: Access the World instance for low-level storage, queuing, and streaming operations. +description: Async function that resolves the World instance for low-level storage, queuing, and streaming operations. type: reference -summary: Use getWorld to access low-level workflow storage, queuing, and streaming backends directly. +summary: Async function that resolves the World instance for low-level workflow storage, queuing, and streaming backends. prerequisites: - /docs/deploying --- -Retrieves the World instance for direct access to workflow storage, queuing, and streaming backends. This function returns a `World` which provides low-level access to manage workflow runs, steps, events, and hooks. +Retrieves the World instance for direct access to workflow storage, queuing, and streaming backends. This async function returns a `Promise<World>` that resolves to a `World`, which provides low-level access to manage workflow runs, steps, events, and hooks. Use this function when you need direct access to the underlying workflow infrastructure, such as listing all runs, querying events, or implementing custom workflow management logic. 
```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); // [!code highlight] +const world = await getWorld(); // [!code highlight] ``` ## API Signature @@ -25,7 +25,7 @@ This function does not accept any parameters. ### Returns -Returns a `World` object: +Returns a `Promise<World>` object: - Get direct access to workflow storage, queuing, and streaming backends. + Async: resolve the World instance for storage, queuing, and streaming backends. Low-level API for inspecting runs, steps, events, hooks, streams, and queues. diff --git a/docs/content/docs/api-reference/workflow-api/world/index.mdx b/docs/content/docs/api-reference/workflow-api/world/index.mdx index 161aae1ca9..1a1c02423a 100644 --- a/docs/content/docs/api-reference/workflow-api/world/index.mdx +++ b/docs/content/docs/api-reference/workflow-api/world/index.mdx @@ -2,7 +2,7 @@ title: World SDK description: Low-level API for inspecting and managing workflow runs, steps, events, hooks, streams, and queues. type: overview -summary: Access workflow infrastructure directly via getWorld() for building observability dashboards, admin tools, and custom integrations. +summary: Access workflow infrastructure via await getWorld() for building observability dashboards, admin tools, and custom integrations. 
prerequisites: - /docs/api-reference/workflow-api/get-world keywords: @@ -19,7 +19,7 @@ The World SDK provides direct access to workflow infrastructure — runs, steps, ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); // [!code highlight] +const world = await getWorld(); // [!code highlight] ``` ## Interfaces diff --git a/docs/content/docs/api-reference/workflow-api/world/observability.mdx b/docs/content/docs/api-reference/workflow-api/world/observability.mdx index a4c7dc63e6..0b09698368 100644 --- a/docs/content/docs/api-reference/workflow-api/world/observability.mdx +++ b/docs/content/docs/api-reference/workflow-api/world/observability.mdx @@ -147,7 +147,7 @@ const hydrated = hydrateResourceIOWithKey(step, key); // [!code highlight] import { getWorld } from "workflow/runtime"; import { parseStepName, parseWorkflowName } from "workflow/observability"; // [!code highlight] -const world = getWorld(); +const world = await getWorld(); const run = await world.runs.get(runId, { resolveData: "none" }); console.log("Workflow:", parseWorkflowName(run.workflowName)?.shortName); // [!code highlight] diff --git a/docs/content/docs/api-reference/workflow-api/world/queue.mdx b/docs/content/docs/api-reference/workflow-api/world/queue.mdx index 95c7b59b74..738c7c9615 100644 --- a/docs/content/docs/api-reference/workflow-api/world/queue.mdx +++ b/docs/content/docs/api-reference/workflow-api/world/queue.mdx @@ -28,7 +28,7 @@ Queue methods live directly on the `world` object (not nested). They dispatch in ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); // [!code highlight] +const world = await getWorld(); // [!code highlight] // Queue methods are called directly on world — e.g. 
world.queue() ``` diff --git a/docs/content/docs/api-reference/workflow-api/world/storage.mdx b/docs/content/docs/api-reference/workflow-api/world/storage.mdx index fd08dadc39..c28f75573e 100644 --- a/docs/content/docs/api-reference/workflow-api/world/storage.mdx +++ b/docs/content/docs/api-reference/workflow-api/world/storage.mdx @@ -37,7 +37,7 @@ The World storage interface exposes four sub-interfaces for querying workflow da ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); // [!code highlight] +const world = await getWorld(); // [!code highlight] ``` --- @@ -304,7 +304,7 @@ const result = await world.hooks.list({ // [!code highlight] ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); let cursor: string | undefined; const runs = await world.runs.list({ // [!code highlight] @@ -319,7 +319,7 @@ cursor = runs.cursor; // pass to next call for pagination ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); // Full data (default) — includes serialized input/output const run = await world.runs.get(runId); // [!code highlight] @@ -336,7 +336,7 @@ const lightweight = await world.runs.get(runId, { // [!code highlight] import { getWorld } from "workflow/runtime"; import { parseStepName } from "workflow/observability"; // [!code highlight] -const world = getWorld(); +const world = await getWorld(); const steps = await world.steps.list({ // [!code highlight] runId, resolveData: "none", @@ -358,7 +358,7 @@ const progress = steps.data.map((step) => { import { getWorld } from "workflow/runtime"; import { hydrateResourceIO, observabilityRevivers } from "workflow/observability"; // [!code highlight] -const world = getWorld(); +const world = await getWorld(); const step = await world.steps.get(runId, stepId); // [!code highlight] const hydrated = hydrateResourceIO(step, 
observabilityRevivers); // [!code highlight] console.log(hydrated.input, hydrated.output); @@ -369,7 +369,7 @@ console.log(hydrated.input, hydrated.output); ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); await world.events.create(runId, { // [!code highlight] eventType: "run_cancelled", // [!code highlight] }); // [!code highlight] @@ -380,7 +380,7 @@ await world.events.create(runId, { // [!code highlight] ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); const hook = await world.hooks.getByToken(token); // [!code highlight] console.log(hook.runId, hook.metadata); // [!code highlight] ``` @@ -390,7 +390,7 @@ console.log(hook.runId, hook.metadata); // [!code highlight] ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); const events = await world.events.list({ runId }); // [!code highlight] for (const event of events.data) { diff --git a/docs/content/docs/api-reference/workflow-api/world/streams.mdx b/docs/content/docs/api-reference/workflow-api/world/streams.mdx index 9fa5480065..7d8b34e280 100644 --- a/docs/content/docs/api-reference/workflow-api/world/streams.mdx +++ b/docs/content/docs/api-reference/workflow-api/world/streams.mdx @@ -21,7 +21,7 @@ keywords: - stream lifecycle --- -Stream methods live on `world.streams` (the `streams` sub-object of the `world` object returned by `getWorld()`). Use them to write chunks, read streams, and manage stream lifecycle outside of the standard `getWritable()` pattern. +Stream methods live on `world.streams` (the `streams` sub-object of the `World` instance returned by `await getWorld()`). Use them to write chunks, read streams, and manage stream lifecycle outside of the standard `getWritable()` pattern. 
For most streaming use cases, use [`getWritable()`](/docs/api-reference/workflow/get-writable) inside steps. Direct stream methods are for advanced scenarios like building custom stream consumers or managing streams from outside a workflow. @@ -32,7 +32,7 @@ Stream methods live on `world.streams` (the `streams` sub-object of the `world` ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); // [!code highlight] +const world = await getWorld(); // [!code highlight] // Stream methods are called on world.streams — e.g. world.streams.write() ``` @@ -183,7 +183,7 @@ export async function GET(req: Request) { const url = new URL(req.url); const streamName = url.searchParams.get("name") ?? "default"; const runId = url.searchParams.get("runId")!; - const world = getWorld(); + const world = await getWorld(); const readable = await world.streams.get(runId, streamName); // [!code highlight] return new Response(readable, { @@ -197,7 +197,7 @@ export async function GET(req: Request) { ```typescript lineNumbers import { getWorld } from "workflow/runtime"; -const world = getWorld(); +const world = await getWorld(); let cursor: string | undefined; do { diff --git a/docs/content/docs/deploying/world/postgres-world.mdx b/docs/content/docs/deploying/world/postgres-world.mdx index cb4dbdd310..678ac53723 100644 --- a/docs/content/docs/deploying/world/postgres-world.mdx +++ b/docs/content/docs/deploying/world/postgres-world.mdx @@ -53,7 +53,8 @@ Create an `instrumentation.ts` file in your project root: export async function register() { if (process.env.NEXT_RUNTIME !== "edge") { const { getWorld } = await import("workflow/runtime"); - await getWorld().start?.(); + const world = await getWorld(); + await world.start?.(); } } ``` @@ -73,7 +74,8 @@ import type { ServerInit } from "@sveltejs/kit"; export const init: ServerInit = async () => { const { getWorld } = await import("workflow/runtime"); - await getWorld().start?.(); + const world = await 
getWorld(); + await world.start?.(); }; ``` @@ -92,7 +94,8 @@ import { defineNitroPlugin } from "nitro/~internal/runtime/plugin"; export default defineNitroPlugin(async () => { const { getWorld } = await import("workflow/runtime"); - await getWorld().start?.(); + const world = await getWorld(); + await world.start?.(); }); ``` @@ -168,7 +171,8 @@ For higher worker concurrency, Graphile Worker recommends setting `maxPoolSize` ### Programmatic configuration -{/* @skip-typecheck: incomplete code sample */} +{/*@skip-typecheck: incomplete code sample*/} + ```typescript title="workflow.config.ts" lineNumbers import { createWorld } from "@workflow/world-postgres"; @@ -200,6 +204,7 @@ Deploy your application to any cloud that supports long-running servers: - Platform-as-a-Service providers (Railway, Render, Fly.io, etc.) Ensure your deployment has: + 1. Network access to your PostgreSQL database 2. Environment variables configured correctly 3. The `start()` function called on server initialization diff --git a/docs/lib/ai-agent-detection.ts b/docs/lib/ai-agent-detection.ts index be0a02a2e4..e4184e8ecf 100644 --- a/docs/lib/ai-agent-detection.ts +++ b/docs/lib/ai-agent-detection.ts @@ -18,84 +18,84 @@ // Layer 1: Known AI agent UA substrings (lowercase). 
const AI_AGENT_UA_PATTERNS = [ // Anthropic — https://support.claude.com/en/articles/8896518 - "claudebot", - "claude-searchbot", - "claude-user", - "anthropic-ai", - "claude-web", + 'claudebot', + 'claude-searchbot', + 'claude-user', + 'anthropic-ai', + 'claude-web', // OpenAI — https://platform.openai.com/docs/bots - "chatgpt", - "gptbot", - "oai-searchbot", - "openai", + 'chatgpt', + 'gptbot', + 'oai-searchbot', + 'openai', // Google AI - "gemini", - "bard", - "google-cloudvertexbot", - "google-extended", + 'gemini', + 'bard', + 'google-cloudvertexbot', + 'google-extended', // Meta - "meta-externalagent", - "meta-externalfetcher", - "meta-webindexer", + 'meta-externalagent', + 'meta-externalfetcher', + 'meta-webindexer', // Search/Research AI - "perplexity", - "youbot", - "you.com", - "deepseekbot", + 'perplexity', + 'youbot', + 'you.com', + 'deepseekbot', // Coding assistants - "cursor", - "github-copilot", - "codeium", - "tabnine", - "sourcegraph", + 'cursor', + 'github-copilot', + 'codeium', + 'tabnine', + 'sourcegraph', // Other AI agents / data scrapers (low-harm to serve markdown) - "cohere-ai", - "bytespider", - "amazonbot", - "ai2bot", - "diffbot", - "omgili", - "omgilibot", + 'cohere-ai', + 'bytespider', + 'amazonbot', + 'ai2bot', + 'diffbot', + 'omgili', + 'omgilibot', ]; // Layer 2: Known AI service URLs in Signature-Agent header (RFC 9421). -const SIGNATURE_AGENT_DOMAINS = ["chatgpt.com"]; +const SIGNATURE_AGENT_DOMAINS = ['chatgpt.com']; // Layer 3: Traditional bot exclusion list — bots that should NOT trigger // the heuristic layer (they're search engine crawlers, social previews, or // monitoring tools, not AI agents). 
const TRADITIONAL_BOT_PATTERNS = [ - "googlebot", - "bingbot", - "yandexbot", - "baiduspider", - "duckduckbot", - "slurp", - "msnbot", - "facebot", - "twitterbot", - "linkedinbot", - "whatsapp", - "telegrambot", - "pingdom", - "uptimerobot", - "newrelic", - "datadog", - "statuspage", - "site24x7", - "applebot", + 'googlebot', + 'bingbot', + 'yandexbot', + 'baiduspider', + 'duckduckbot', + 'slurp', + 'msnbot', + 'facebot', + 'twitterbot', + 'linkedinbot', + 'whatsapp', + 'telegrambot', + 'pingdom', + 'uptimerobot', + 'newrelic', + 'datadog', + 'statuspage', + 'site24x7', + 'applebot', ]; // Broad regex for bot-like UA strings (used only in Layer 3 heuristic). const BOT_LIKE_REGEX = /bot|agent|fetch|crawl|spider|search/i; -export type DetectionMethod = "ua-match" | "signature-agent" | "heuristic"; +export type DetectionMethod = 'ua-match' | 'signature-agent' | 'heuristic'; export interface DetectionResult { detected: boolean; @@ -111,36 +111,36 @@ export interface DetectionResult { export function isAIAgent(request: { headers: { get(name: string): string | null }; }): DetectionResult { - const userAgent = request.headers.get("user-agent"); + const userAgent = request.headers.get('user-agent'); // Layer 1: Known UA pattern match if (userAgent) { const lowerUA = userAgent.toLowerCase(); if (AI_AGENT_UA_PATTERNS.some((pattern) => lowerUA.includes(pattern))) { - return { detected: true, method: "ua-match" }; + return { detected: true, method: 'ua-match' }; } } // Layer 2: Signature-Agent header (RFC 9421, used by ChatGPT agent) - const signatureAgent = request.headers.get("signature-agent"); + const signatureAgent = request.headers.get('signature-agent'); if (signatureAgent) { const lowerSig = signatureAgent.toLowerCase(); if (SIGNATURE_AGENT_DOMAINS.some((domain) => lowerSig.includes(domain))) { - return { detected: true, method: "signature-agent" }; + return { detected: true, method: 'signature-agent' }; } } // Layer 3: Missing browser fingerprint heuristic // Real 
browsers (Chrome 76+, Firefox 90+, Safari 16.4+) send sec-fetch-mode // on navigation requests. Its absence signals a programmatic client. - const secFetchMode = request.headers.get("sec-fetch-mode"); + const secFetchMode = request.headers.get('sec-fetch-mode'); if (!secFetchMode && userAgent && BOT_LIKE_REGEX.test(userAgent)) { const lowerUA = userAgent.toLowerCase(); const isTraditionalBot = TRADITIONAL_BOT_PATTERNS.some((pattern) => lowerUA.includes(pattern) ); if (!isTraditionalBot) { - return { detected: true, method: "heuristic" }; + return { detected: true, method: 'heuristic' }; } } diff --git a/docs/proxy.ts b/docs/proxy.ts index 683a1f307c..02b2327970 100644 --- a/docs/proxy.ts +++ b/docs/proxy.ts @@ -59,13 +59,13 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { // AI agent detection — rewrite docs pages to markdown for agents // so they always get structured content without needing .md URLs or Accept headers if ( - (pathname === "/docs" || pathname.startsWith("/docs/")) && - !pathname.includes("/llms.mdx/") + (pathname === '/docs' || pathname.startsWith('/docs/')) && + !pathname.includes('/llms.mdx/') ) { const agentResult = isAIAgent(request); if (agentResult.detected && !isMarkdownPreferred(request)) { const result = - pathname === "/docs" + pathname === '/docs' ? 
`/${i18n.defaultLanguage}/llms.mdx` : rewriteLLM(pathname); @@ -73,10 +73,10 @@ const proxy = (request: NextRequest, context: NextFetchEvent) => { context.waitUntil( trackMdRequest({ path: pathname, - userAgent: request.headers.get("user-agent"), - referer: request.headers.get("referer"), - acceptHeader: request.headers.get("accept"), - requestType: "agent-rewrite", + userAgent: request.headers.get('user-agent'), + referer: request.headers.get('referer'), + acceptHeader: request.headers.get('accept'), + requestType: 'agent-rewrite', detectionMethod: agentResult.method, }) ); diff --git a/packages/cli/src/base.ts b/packages/cli/src/base.ts index 8c6d091463..ace29f4055 100644 --- a/packages/cli/src/base.ts +++ b/packages/cli/src/base.ts @@ -37,7 +37,7 @@ export abstract class BaseCommand extends Command { */ async finally(err: Error | undefined): Promise { try { - const world = getWorld(); + const world = await getWorld(); await world.close?.(); } catch (closeErr) { this.warn( diff --git a/packages/cli/src/lib/inspect/setup.ts b/packages/cli/src/lib/inspect/setup.ts index 2cd7d60f6f..dde0b13599 100644 --- a/packages/cli/src/lib/inspect/setup.ts +++ b/packages/cli/src/lib/inspect/setup.ts @@ -128,7 +128,7 @@ export const setupCliWorld = async ( }, }); } else { - world = createWorld(); + world = await createWorld(); } // Store in the global cache so BaseCommand.finally() can find and close it. diff --git a/packages/core/e2e/e2e.test.ts b/packages/core/e2e/e2e.test.ts index 6e0709dd83..a2533a3f46 100644 --- a/packages/core/e2e/e2e.test.ts +++ b/packages/core/e2e/e2e.test.ts @@ -411,7 +411,7 @@ describe('e2e', () => { // Poll until all 3 webhooks are registered. // On Vercel, webhook registration can be slow due to cold starts and // queue processing latency, so we allow up to 60s. 
- const world = getWorld(); + const world = await getWorld(); const hooks = await (async () => { const deadline = Date.now() + 60_000; while (Date.now() < deadline) { @@ -773,7 +773,7 @@ describe('e2e', () => { } // Read all chunks via getChunks pagination - const world = getWorld(); + const world = await getWorld(); const streamName = `${run.runId.replace('wrun_', 'strm_')}_user`; const paginatedChunks: Uint8Array[] = []; let cursor: string | null = null; @@ -1591,7 +1591,7 @@ describe('e2e', () => { // Tests the queue-based health check using healthCheck() directly. // This bypasses Vercel Deployment Protection by sending messages // through the Queue infrastructure rather than direct HTTP. - const world = getWorld(); + const world = await getWorld(); // Test workflow endpoint health check const workflowResult = await healthCheck(world, 'workflow', { @@ -1988,7 +1988,7 @@ describe('e2e', () => { // This exercises the same cancelRun code path that the CLI uses // (the CLI delegates directly to this function). const { cancelRun } = await import('../src/runtime'); - await cancelRun(getWorld(), run.runId); + await cancelRun(await getWorld(), run.runId); // Verify the run was cancelled - returnValue should throw WorkflowRunCancelledError const error = await run.returnValue.catch((e: unknown) => e); @@ -2206,7 +2206,7 @@ describe('e2e', () => { // (run_created) throws a 500 server error. The queue should still // be dispatched with runInput, and the runtime should bootstrap // the run via the run_started fallback path. 
- const realWorld = getWorld(); + const realWorld = await getWorld(); let createCallCount = 0; const stubbedWorld: World = { ...realWorld, diff --git a/packages/core/e2e/utils.ts b/packages/core/e2e/utils.ts index 9e06b943ec..01ed6df1b3 100644 --- a/packages/core/e2e/utils.ts +++ b/packages/core/e2e/utils.ts @@ -519,7 +519,7 @@ async function getRunDiagnostics(tracked: TrackedRun): Promise { ]; try { - const world = getWorld(); + const world = await getWorld(); const runData = await world.runs.get(run.runId); lines.push(`Status: ${runData.status}`); diff --git a/packages/core/src/runtime.ts b/packages/core/src/runtime.ts index 902936b74f..1b5a6e1b3a 100644 --- a/packages/core/src/runtime.ts +++ b/packages/core/src/runtime.ts @@ -27,7 +27,11 @@ import { withHealthCheck, } from './runtime/helpers.js'; import { handleSuspension } from './runtime/suspension-handler.js'; -import { getWorld, getWorldHandlers } from './runtime/world.js'; +import { + getWorld, + getWorldHandlers, + type WorldHandlers, +} from './runtime/world.js'; import { remapErrorStack } from './source-map.js'; import * as Attribute from './telemetry/semantic-conventions.js'; import { @@ -97,99 +101,48 @@ export { export function workflowEntrypoint( workflowCode: string ): (req: Request) => Promise { - const { createQueueHandler, specVersion: worldSpecVersion } = - getWorldHandlers(); - const handler = createQueueHandler( - '__wkf_workflow_', - async (message_, metadata) => { - // Check if this is a health check message - // NOTE: Health check messages are intentionally unauthenticated for monitoring purposes. - // They only write a simple status response to a stream and do not expose sensitive data. - // The stream name includes a unique correlationId that must be known by the caller. 
- const healthCheck = parseHealthCheckPayload(message_); - if (healthCheck) { - await handleHealthCheckMessage( - healthCheck, - 'workflow', - worldSpecVersion - ); - return; - } - - const { - runId, - traceCarrier: traceContext, - requestedAt, - runInput, - } = WorkflowInvokePayloadSchema.parse(message_); - const { requestId } = metadata; - // Extract the workflow name from the topic name - const workflowName = metadata.queueName.slice('__wkf_workflow_'.length); - - // --- Max delivery check --- - // Enforce max delivery limit before any infrastructure calls. - // This prevents runaway workflows from consuming infinite queue deliveries. - // At this point, we want to do the minimal amount of work (no fetching - // of the workflow events, etc. We simply attempt to mark the run as failed - // and if that fails, the message is still consumed but with adequate logging - // that an error occurred preventing us from failing the run. - if (metadata.attempt > MAX_QUEUE_DELIVERIES) { - runtimeLogger.error( - `Workflow handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, - { workflowRunId: runId, workflowName, attempt: metadata.attempt } - ); - try { - const world = getWorld(); - await world.events.create( - runId, - { - eventType: 'run_failed', - specVersion: SPEC_VERSION_CURRENT, - eventData: { - error: { - message: `Workflow exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, - }, - errorCode: RUN_ERROR_CODES.MAX_DELIVERIES_EXCEEDED, - }, - }, - { requestId } - ); - } catch (err) { - if (EntityConflictError.is(err) || RunExpiredError.is(err)) { - // Run already finished, consume the message silently - return; - } - runtimeLogger.error( - `Failed to mark run as failed after ${metadata.attempt} delivery attempts. ` + - `A persistent error is preventing the run from being terminated. ` + - `The run will remain in its current state until manually resolved. 
` + - `This is most likely due to a persistent outage of the workflow backend ` + - `or a bug in the workflow runtime and should be reported to the Workflow team.`, - { - workflowRunId: runId, - error: err instanceof Error ? err.message : String(err), - attempt: metadata.attempt, - } + const handler = (worldHandlers: WorldHandlers) => + worldHandlers.createQueueHandler( + '__wkf_workflow_', + async (message_, metadata) => { + // Check if this is a health check message + // NOTE: Health check messages are intentionally unauthenticated for monitoring purposes. + // They only write a simple status response to a stream and do not expose sensitive data. + // The stream name includes a unique correlationId that must be known by the caller. + const healthCheck = parseHealthCheckPayload(message_); + if (healthCheck) { + await handleHealthCheckMessage( + healthCheck, + 'workflow', + worldHandlers.specVersion ); + return; } - return; - } - const spanLinks = await linkToCurrentContext(); - - // --- Replay timeout guard --- - // If the replay takes longer than the timeout, fail the run and exit. - // This must be lower than the function's maxDuration to ensure - // the failure is recorded before the platform kills the function. - let replayTimeout: NodeJS.Timeout | undefined; - if (process.env.VERCEL_URL !== undefined) { - replayTimeout = setTimeout(async () => { - runtimeLogger.error('Workflow replay exceeded timeout', { - workflowRunId: runId, - timeoutMs: REPLAY_TIMEOUT_MS, - }); + const { + runId, + traceCarrier: traceContext, + requestedAt, + runInput, + } = WorkflowInvokePayloadSchema.parse(message_); + const { requestId } = metadata; + // Extract the workflow name from the topic name + const workflowName = metadata.queueName.slice('__wkf_workflow_'.length); + + // --- Max delivery check --- + // Enforce max delivery limit before any infrastructure calls. + // This prevents runaway workflows from consuming infinite queue deliveries. 
+ // At this point, we want to do the minimal amount of work (no fetching + // of the workflow events, etc. We simply attempt to mark the run as failed + // and if that fails, the message is still consumed but with adequate logging + // that an error occurred preventing us from failing the run. + if (metadata.attempt > MAX_QUEUE_DELIVERIES) { + runtimeLogger.error( + `Workflow handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, + { workflowRunId: runId, workflowName, attempt: metadata.attempt } + ); try { - const world = getWorld(); + const world = await getWorld(); await world.events.create( runId, { @@ -197,130 +150,392 @@ export function workflowEntrypoint( specVersion: SPEC_VERSION_CURRENT, eventData: { error: { - message: `Workflow replay exceeded maximum duration (${REPLAY_TIMEOUT_MS / 1000}s)`, + message: `Workflow exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, }, - errorCode: RUN_ERROR_CODES.REPLAY_TIMEOUT, + errorCode: RUN_ERROR_CODES.MAX_DELIVERIES_EXCEEDED, }, }, { requestId } ); - } catch { - // Best effort — process exits regardless + } catch (err) { + if (EntityConflictError.is(err) || RunExpiredError.is(err)) { + // Run already finished, consume the message silently + return; + } + runtimeLogger.error( + `Failed to mark run as failed after ${metadata.attempt} delivery attempts. ` + + `A persistent error is preventing the run from being terminated. ` + + `The run will remain in its current state until manually resolved. ` + + `This is most likely due to a persistent outage of the workflow backend ` + + `or a bug in the workflow runtime and should be reported to the Workflow team.`, + { + workflowRunId: runId, + error: err instanceof Error ? err.message : String(err), + attempt: metadata.attempt, + } + ); } - // Note that this also prevents the runtime to acking the queue message, - // so the queue will call back once, after which a 410 will get it to exit early. 
- process.exit(1); - }, REPLAY_TIMEOUT_MS); - replayTimeout.unref(); - } + return; + } - // Invoke user workflow within the propagated trace context and baggage - return await withTraceContext(traceContext, async () => { - // Set workflow context as baggage for automatic propagation - return await withWorkflowBaggage( - { workflowRunId: runId, workflowName }, - async () => { - const world = getWorld(); - return trace( - `WORKFLOW ${workflowName}`, - { links: spanLinks }, - async (span) => { - span?.setAttributes({ - ...Attribute.WorkflowName(workflowName), - ...Attribute.WorkflowOperation('execute'), - // Standard OTEL messaging conventions - ...Attribute.MessagingSystem('vercel-queue'), - ...Attribute.MessagingDestinationName(metadata.queueName), - ...Attribute.MessagingMessageId(metadata.messageId), - ...Attribute.MessagingOperationType('process'), - ...getQueueOverhead({ requestedAt }), - }); - - // TODO: validate `workflowName` exists before consuming message? - - span?.setAttributes({ - ...Attribute.WorkflowRunId(runId), - ...Attribute.WorkflowTracePropagated(!!traceContext), - }); - - let workflowStartedAt = -1; - let workflowRun: WorkflowRun | undefined; - // Pre-loaded events from the run_started response. - // When present, we skip the events.list call. - let preloadedEvents: Event[] | undefined; - - // --- Infrastructure: prepare the run state --- - // Always call run_started directly — this both transitions - // the run to 'running' AND returns the run entity, saving - // a separate runs.get round-trip. - // Contract: events.create('run_started') must be idempotent - // for runs already in 'running' status (return the run - // without error), not just for pending → running transitions. - // Network/server errors propagate to the queue handler for retry. - // WorkflowRuntimeError (data integrity issues) are fatal and - // produce run_failed since retrying won't fix them. 
- try { - const result = await world.events.create( - runId, - { - eventType: 'run_started', - // Use the spec version from the original start() call - // when available, so the resilient start path creates - // the run with the correct version (not always current). - specVersion: - runInput?.specVersion ?? SPEC_VERSION_CURRENT, - // Pass run input from queue so the server can - // create the run if run_created was missed. - // Uint8Array values survive the queue natively - // (CBOR on world-vercel, JSON reviver on world-local). - ...(runInput - ? { - eventData: { - input: runInput.input, - deploymentId: runInput.deploymentId, - workflowName: runInput.workflowName, - executionContext: runInput.executionContext, - }, - } - : {}), + const spanLinks = await linkToCurrentContext(); + + // --- Replay timeout guard --- + // If the replay takes longer than the timeout, fail the run and exit. + // This must be lower than the function's maxDuration to ensure + // the failure is recorded before the platform kills the function. 
+ let replayTimeout: NodeJS.Timeout | undefined; + if (process.env.VERCEL_URL !== undefined) { + replayTimeout = setTimeout(async () => { + runtimeLogger.error('Workflow replay exceeded timeout', { + workflowRunId: runId, + timeoutMs: REPLAY_TIMEOUT_MS, + }); + try { + const world = await getWorld(); + await world.events.create( + runId, + { + eventType: 'run_failed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + error: { + message: `Workflow replay exceeded maximum duration (${REPLAY_TIMEOUT_MS / 1000}s)`, }, - { requestId } - ); - if (!result.run) { - throw new WorkflowRuntimeError( - `Event creation for 'run_started' did not return the run entity for run "${runId}"` - ); - } - workflowRun = result.run; + errorCode: RUN_ERROR_CODES.REPLAY_TIMEOUT, + }, + }, + { requestId } + ); + } catch { + // Best effort — process exits regardless + } + // Note that this also prevents the runtime from acking the queue message, + // so the queue will call back once, after which a 410 will get it to exit early. + process.exit(1); + }, REPLAY_TIMEOUT_MS); + replayTimeout.unref(); + } - // If the response includes events, use them to skip - // the initial events.list call and reduce TTFB. 
- if (result.events && result.events.length > 0) { - preloadedEvents = result.events; - } + // Invoke user workflow within the propagated trace context and baggage + return await withTraceContext(traceContext, async () => { + // Set workflow context as baggage for automatic propagation + return await withWorkflowBaggage( + { workflowRunId: runId, workflowName }, + async () => { + const world = await getWorld(); + return trace( + `WORKFLOW ${workflowName}`, + { links: spanLinks }, + async (span) => { + span?.setAttributes({ + ...Attribute.WorkflowName(workflowName), + ...Attribute.WorkflowOperation('execute'), + // Standard OTEL messaging conventions + ...Attribute.MessagingSystem('vercel-queue'), + ...Attribute.MessagingDestinationName(metadata.queueName), + ...Attribute.MessagingMessageId(metadata.messageId), + ...Attribute.MessagingOperationType('process'), + ...getQueueOverhead({ requestedAt }), + }); + + // TODO: validate `workflowName` exists before consuming message? + + span?.setAttributes({ + ...Attribute.WorkflowRunId(runId), + ...Attribute.WorkflowTracePropagated(!!traceContext), + }); - if (!workflowRun.startedAt) { - throw new WorkflowRuntimeError( - `Workflow run "${runId}" has no "startedAt" timestamp` + let workflowStartedAt = -1; + let workflowRun: WorkflowRun | undefined; + // Pre-loaded events from the run_started response. + // When present, we skip the events.list call to reduce TTFB. + let preloadedEvents: Event[] | undefined; + + // --- Infrastructure: prepare the run state --- + // Always call run_started directly — this both transitions + // the run to 'running' AND returns the run entity, saving + // a separate runs.get round-trip. + // Contract: events.create('run_started') must be idempotent + // for runs already in 'running' status (return the run + // without error), not just for pending → running transitions. + // Network/server errors propagate to the queue handler for retry. 
+ // WorkflowRuntimeError (data integrity issues) are fatal and + // produce run_failed since retrying won't fix them. + try { + const result = await world.events.create( + runId, + { + eventType: 'run_started', + // Use the spec version from the original start() call + // when available, so the resilient start path creates + // the run with the correct version (not always current). + specVersion: + runInput?.specVersion ?? SPEC_VERSION_CURRENT, + // Pass run input from queue so the server can + // create the run if run_created was missed. + // Uint8Array values survive the queue natively + // (CBOR on world-vercel, JSON reviver on world-local). + ...(runInput + ? { + eventData: { + input: runInput.input, + deploymentId: runInput.deploymentId, + workflowName: runInput.workflowName, + executionContext: runInput.executionContext, + }, + } + : {}), + }, + { requestId } ); + if (!result.run) { + throw new WorkflowRuntimeError( + `Event creation for 'run_started' did not return the run entity for run "${runId}"` + ); + } + workflowRun = result.run; + + // If the response includes events, use them to skip + // the initial events.list call and reduce TTFB. + if (result.events && result.events.length > 0) { + preloadedEvents = result.events; + } + + if (!workflowRun.startedAt) { + throw new WorkflowRuntimeError( + `Workflow run "${runId}" has no "startedAt" timestamp` + ); + } + } catch (err) { + // Run was concurrently completed/failed/cancelled + if ( + EntityConflictError.is(err) || + RunExpiredError.is(err) + ) { + // EntityConflictError: run was concurrently + // completed/failed/cancelled during setup. + // RunExpiredError: run already in terminal state. + // In both cases, skip processing this message. 
+ runtimeLogger.info( + 'Run already finished during setup, skipping', + { workflowRunId: runId, message: err.message } + ); + return; + } else if (err instanceof WorkflowRuntimeError) { + runtimeLogger.error( + 'Fatal runtime error during workflow setup', + { workflowRunId: runId, error: err.message } + ); + try { + await world.events.create( + runId, + { + eventType: 'run_failed', + specVersion: SPEC_VERSION_CURRENT, + eventData: { + error: { + message: err.message, + stack: err.stack, + }, + errorCode: RUN_ERROR_CODES.RUNTIME_ERROR, + }, + }, + { requestId } + ); + } catch (failErr) { + if ( + EntityConflictError.is(failErr) || + RunExpiredError.is(failErr) + ) { + return; + } + throw failErr; + } + return; + } else { + throw err; + } } - } catch (err) { - // Run was concurrently completed/failed/cancelled - if (EntityConflictError.is(err) || RunExpiredError.is(err)) { - // EntityConflictError: run was concurrently - // completed/failed/cancelled during setup. - // RunExpiredError: run already in terminal state. - // In both cases, skip processing this message. + + workflowStartedAt = +workflowRun.startedAt; + + span?.setAttributes({ + ...Attribute.WorkflowRunStatus(workflowRun.status), + ...Attribute.WorkflowStartedAt(workflowStartedAt), + }); + + if (workflowRun.status !== 'running') { + // Workflow has already completed or failed, so we can skip it runtimeLogger.info( - 'Run already finished during setup, skipping', - { workflowRunId: runId, message: err.message } + 'Workflow already completed or failed, skipping', + { + workflowRunId: runId, + status: workflowRun.status, + } ); + + // TODO: for `cancel`, we actually want to propagate a WorkflowCancelled event + // inside the workflow context so the user can gracefully exit. this is SIGTERM + // TODO: furthermore, there should be a timeout or a way to force cancel SIGKILL + // so that we actually exit here without replaying the workflow at all, in the case + // that replaying the workflow is itself failing. 
+ return; - } else if (err instanceof WorkflowRuntimeError) { - runtimeLogger.error( - 'Fatal runtime error during workflow setup', - { workflowRunId: runId, error: err.message } + } + + // Load all events into memory before running. + // If we got pre-loaded events from the run_started response, + // skip the events.list round-trip to reduce TTFB. + const events = + preloadedEvents ?? + (await getAllWorkflowRunEvents(workflowRun.runId)); + + // Check for any elapsed waits and create wait_completed events + const now = Date.now(); + + // Pre-compute completed correlation IDs for O(n) lookup instead of O(n²) + const completedWaitIds = new Set( + events + .filter((e) => e.eventType === 'wait_completed') + .map((e) => e.correlationId) + ); + + // Collect all waits that need completion + const waitsToComplete = events + .filter( + (e): e is typeof e & { correlationId: string } => + e.eventType === 'wait_created' && + e.correlationId !== undefined && + !completedWaitIds.has(e.correlationId) && + now >= (e.eventData.resumeAt as Date).getTime() + ) + .map((e) => ({ + eventType: 'wait_completed' as const, + specVersion: SPEC_VERSION_CURRENT, + correlationId: e.correlationId, + })); + + // Create all wait_completed events + for (const waitEvent of waitsToComplete) { + try { + const result = await world.events.create( + runId, + waitEvent, + { + requestId, + } + ); + // Add the event to the events array so the workflow can see it + events.push(result.event!); + } catch (err) { + if (EntityConflictError.is(err)) { + runtimeLogger.info('Wait already completed, skipping', { + workflowRunId: runId, + correlationId: waitEvent.correlationId, + }); + continue; + } + throw err; + } + } + + // Resolve the encryption key for this run's deployment + const rawKey = + await world.getEncryptionKeyForRun?.(workflowRun); + const encryptionKey = rawKey + ? 
await importKey(rawKey) + : undefined; + + // --- User code execution --- + // Only errors from runWorkflow() (user workflow code) should + // produce run_failed. Infrastructure errors (network, server) + // must propagate to the queue handler for automatic retry. + let workflowResult: unknown; + try { + workflowResult = await trace( + 'workflow.replay', + {}, + async (replaySpan) => { + replaySpan?.setAttributes({ + ...Attribute.WorkflowEventsCount(events.length), + }); + return await runWorkflow( + workflowCode, + workflowRun, + events, + encryptionKey + ); + } ); + } catch (err) { + // WorkflowSuspension is normal control flow — not an error + if (WorkflowSuspension.is(err)) { + const suspensionMessage = buildWorkflowSuspensionMessage( + runId, + err.stepCount, + err.hookCount, + err.waitCount + ); + if (suspensionMessage) { + runtimeLogger.debug(suspensionMessage); + } + + const result = await handleSuspension({ + suspension: err, + world, + run: workflowRun, + span, + requestId, + }); + + if (result.timeoutSeconds !== undefined) { + return { timeoutSeconds: result.timeoutSeconds }; + } + + // Suspension handled, no further work needed + return; + } + + // This is a user code error or a WorkflowRuntimeError + // (e.g., corrupted event log). Fail the workflow run. 
+ + // Record exception for OTEL error tracking + if (err instanceof Error) { + span?.recordException?.(err); + } + + const normalizedError = await normalizeUnknownError(err); + const errorName = normalizedError.name || getErrorName(err); + const errorMessage = normalizedError.message; + let errorStack = + normalizedError.stack || getErrorStack(err); + + // Remap error stack using source maps to show original source locations + if (errorStack) { + const parsedName = parseWorkflowName(workflowName); + const filename = + parsedName?.moduleSpecifier || workflowName; + errorStack = remapErrorStack( + errorStack, + filename, + workflowCode + ); + } + + // Classify the error: WorkflowRuntimeError indicates an + // internal issue (corrupted event log, missing data); + // everything else is a user code error. + const errorCode = classifyRunError(err); + + runtimeLogger.error('Error while running workflow', { + workflowRunId: runId, + errorCode, + errorName, + errorStack, + }); + + // Fail the workflow run via event (event-sourced architecture) try { await world.events.create( runId, @@ -329,10 +544,10 @@ export function workflowEntrypoint( specVersion: SPEC_VERSION_CURRENT, eventData: { error: { - message: err.message, - stack: err.stack, + message: errorMessage, + stack: errorStack, }, - errorCode: RUN_ERROR_CODES.RUNTIME_ERROR, + errorCode, }, }, { requestId } @@ -342,286 +557,91 @@ export function workflowEntrypoint( EntityConflictError.is(failErr) || RunExpiredError.is(failErr) ) { + runtimeLogger.info( + 'Tried failing workflow run, but run has already finished.', + { + workflowRunId: runId, + message: failErr.message, + } + ); + span?.setAttributes({ + ...Attribute.WorkflowErrorCode(errorCode), + ...Attribute.WorkflowErrorName(errorName), + ...Attribute.WorkflowErrorMessage(errorMessage), + ...Attribute.ErrorType(errorName), + }); return; + } else { + throw failErr; } - throw failErr; - } - return; - } else { - throw err; - } - } - - workflowStartedAt = 
+workflowRun.startedAt; - - span?.setAttributes({ - ...Attribute.WorkflowRunStatus(workflowRun.status), - ...Attribute.WorkflowStartedAt(workflowStartedAt), - }); - - if (workflowRun.status !== 'running') { - // Workflow has already completed or failed, so we can skip it - runtimeLogger.info( - 'Workflow already completed or failed, skipping', - { - workflowRunId: runId, - status: workflowRun.status, - } - ); - - // TODO: for `cancel`, we actually want to propagate a WorkflowCancelled event - // inside the workflow context so the user can gracefully exit. this is SIGTERM - // TODO: furthermore, there should be a timeout or a way to force cancel SIGKILL - // so that we actually exit here without replaying the workflow at all, in the case - // the replaying the workflow is itself failing. - - return; - } - - // Load all events into memory before running. - // If we got pre-loaded events from the run_started response, - // skip the events.list round-trip to reduce TTFB. - const events = - preloadedEvents ?? 
- (await getAllWorkflowRunEvents(workflowRun.runId)); - - // Check for any elapsed waits and create wait_completed events - const now = Date.now(); - - // Pre-compute completed correlation IDs for O(n) lookup instead of O(n²) - const completedWaitIds = new Set( - events - .filter((e) => e.eventType === 'wait_completed') - .map((e) => e.correlationId) - ); - - // Collect all waits that need completion - const waitsToComplete = events - .filter( - (e): e is typeof e & { correlationId: string } => - e.eventType === 'wait_created' && - e.correlationId !== undefined && - !completedWaitIds.has(e.correlationId) && - now >= (e.eventData.resumeAt as Date).getTime() - ) - .map((e) => ({ - eventType: 'wait_completed' as const, - specVersion: SPEC_VERSION_CURRENT, - correlationId: e.correlationId, - })); - - // Create all wait_completed events - for (const waitEvent of waitsToComplete) { - try { - const result = await world.events.create(runId, waitEvent, { - requestId, - }); - // Add the event to the events array so the workflow can see it - events.push(result.event!); - } catch (err) { - if (EntityConflictError.is(err)) { - runtimeLogger.info('Wait already completed, skipping', { - workflowRunId: runId, - correlationId: waitEvent.correlationId, - }); - continue; - } - throw err; - } - } - - // Resolve the encryption key for this run's deployment - const rawKey = - await world.getEncryptionKeyForRun?.(workflowRun); - const encryptionKey = rawKey - ? await importKey(rawKey) - : undefined; - - // --- User code execution --- - // Only errors from runWorkflow() (user workflow code) should - // produce run_failed. Infrastructure errors (network, server) - // must propagate to the queue handler for automatic retry. 
- let workflowResult: unknown; - try { - workflowResult = await trace( - 'workflow.replay', - {}, - async (replaySpan) => { - replaySpan?.setAttributes({ - ...Attribute.WorkflowEventsCount(events.length), - }); - return await runWorkflow( - workflowCode, - workflowRun, - events, - encryptionKey - ); - } - ); - } catch (err) { - // WorkflowSuspension is normal control flow — not an error - if (WorkflowSuspension.is(err)) { - const suspensionMessage = buildWorkflowSuspensionMessage( - runId, - err.stepCount, - err.hookCount, - err.waitCount - ); - if (suspensionMessage) { - runtimeLogger.debug(suspensionMessage); } - const result = await handleSuspension({ - suspension: err, - world, - run: workflowRun, - span, - requestId, + span?.setAttributes({ + ...Attribute.WorkflowRunStatus('failed'), + ...Attribute.WorkflowErrorCode(errorCode), + ...Attribute.WorkflowErrorName(errorName), + ...Attribute.WorkflowErrorMessage(errorMessage), + ...Attribute.ErrorType(errorName), }); - - if (result.timeoutSeconds !== undefined) { - return { timeoutSeconds: result.timeoutSeconds }; - } - - // Suspension handled, no further work needed return; } - // This is a user code error or a WorkflowRuntimeError - // (e.g., corrupted event log). Fail the workflow run. 
- - // Record exception for OTEL error tracking - if (err instanceof Error) { - span?.recordException?.(err); - } - - const normalizedError = await normalizeUnknownError(err); - const errorName = normalizedError.name || getErrorName(err); - const errorMessage = normalizedError.message; - let errorStack = normalizedError.stack || getErrorStack(err); - - // Remap error stack using source maps to show original source locations - if (errorStack) { - const parsedName = parseWorkflowName(workflowName); - const filename = - parsedName?.moduleSpecifier || workflowName; - errorStack = remapErrorStack( - errorStack, - filename, - workflowCode - ); - } - - // Classify the error: WorkflowRuntimeError indicates an - // internal issue (corrupted event log, missing data); - // everything else is a user code error. - const errorCode = classifyRunError(err); - - runtimeLogger.error('Error while running workflow', { - workflowRunId: runId, - errorCode, - errorName, - errorStack, - }); - - // Fail the workflow run via event (event-sourced architecture) + // --- Infrastructure: complete the run --- + // This is outside the user-code try/catch so that failures + // here (e.g., network errors) propagate to the queue handler. 
try { await world.events.create( runId, { - eventType: 'run_failed', + eventType: 'run_completed', specVersion: SPEC_VERSION_CURRENT, eventData: { - error: { - message: errorMessage, - stack: errorStack, - }, - errorCode, + output: workflowResult, }, }, { requestId } ); - } catch (failErr) { + } catch (err) { if ( - EntityConflictError.is(failErr) || - RunExpiredError.is(failErr) + EntityConflictError.is(err) || + RunExpiredError.is(err) ) { runtimeLogger.info( - 'Tried failing workflow run, but run has already finished.', + 'Tried completing workflow run, but run has already finished.', { workflowRunId: runId, - message: failErr.message, + message: err.message, } ); - span?.setAttributes({ - ...Attribute.WorkflowErrorCode(errorCode), - ...Attribute.WorkflowErrorName(errorName), - ...Attribute.WorkflowErrorMessage(errorMessage), - ...Attribute.ErrorType(errorName), - }); return; } else { - throw failErr; + throw err; } } span?.setAttributes({ - ...Attribute.WorkflowRunStatus('failed'), - ...Attribute.WorkflowErrorCode(errorCode), - ...Attribute.WorkflowErrorName(errorName), - ...Attribute.WorkflowErrorMessage(errorMessage), - ...Attribute.ErrorType(errorName), + ...Attribute.WorkflowRunStatus('completed'), + ...Attribute.WorkflowEventsCount(events.length), }); - return; } - - // --- Infrastructure: complete the run --- - // This is outside the user-code try/catch so that failures - // here (e.g., network errors) propagate to the queue handler. 
- try { - await world.events.create( - runId, - { - eventType: 'run_completed', - specVersion: SPEC_VERSION_CURRENT, - eventData: { - output: workflowResult, - }, - }, - { requestId } - ); - } catch (err) { - if (EntityConflictError.is(err) || RunExpiredError.is(err)) { - runtimeLogger.info( - 'Tried completing workflow run, but run has already finished.', - { - workflowRunId: runId, - message: err.message, - } - ); - return; - } else { - throw err; - } - } - - span?.setAttributes({ - ...Attribute.WorkflowRunStatus('completed'), - ...Attribute.WorkflowEventsCount(events.length), - }); - } - ); // End trace + ); // End trace + } + ); // End withWorkflowBaggage + }).finally(() => { + if (replayTimeout) { + clearTimeout(replayTimeout); } - ); // End withWorkflowBaggage - }).finally(() => { - if (replayTimeout) { - clearTimeout(replayTimeout); - } - }); // End withTraceContext - } - ); + }); // End withTraceContext + } + ); - return withHealthCheck(handler, worldSpecVersion); + let cachedHandler: ((req: Request) => Promise) | undefined; + return withHealthCheck(async (req) => { + if (!cachedHandler) { + cachedHandler = handler(await getWorldHandlers()); + } + return cachedHandler(req); + }); } // this is a no-op placeholder as the client is diff --git a/packages/core/src/runtime/helpers.ts b/packages/core/src/runtime/helpers.ts index 620bb2273b..a2dd151c5c 100644 --- a/packages/core/src/runtime/helpers.ts +++ b/packages/core/src/runtime/helpers.ts @@ -99,7 +99,7 @@ export async function handleHealthCheckMessage( endpoint: 'workflow' | 'step', worldSpecVersion?: number ): Promise { - const world = getWorld(); + const world = await getWorld(); const streamName = getHealthCheckStreamName(healthCheck.correlationId); const response = JSON.stringify({ healthy: true, @@ -320,7 +320,7 @@ export async function getAllWorkflowRunEvents(runId: string): Promise { let hasMore = true; let pagesLoaded = 0; - const world = getWorld(); + const world = await getWorld(); const loadStart = 
Date.now(); while (hasMore) { // TODO: we're currently loading all the data with resolveRef behaviour. We need to update this diff --git a/packages/core/src/runtime/resume-hook.ts b/packages/core/src/runtime/resume-hook.ts index f894e55a1e..eed7e62450 100644 --- a/packages/core/src/runtime/resume-hook.ts +++ b/packages/core/src/runtime/resume-hook.ts @@ -35,7 +35,7 @@ async function getHookByTokenWithKey(token: string): Promise<{ run: WorkflowRun; encryptionKey: CryptoKey | undefined; }> { - const world = getWorld(); + const world = await getWorld(); const hook = await world.hooks.getByToken(token); const run = await world.runs.get(hook.runId); const rawKey = await world.getEncryptionKeyForRun?.(run); @@ -98,7 +98,7 @@ export async function resumeHook( ): Promise { return await waitedUntil(() => { return trace('hook.resume', async (span) => { - const world = getWorld(); + const world = await getWorld(); try { let hook: Hook; diff --git a/packages/core/src/runtime/run.ts b/packages/core/src/runtime/run.ts index 091ce57535..e755cfd363 100644 --- a/packages/core/src/runtime/run.ts +++ b/packages/core/src/runtime/run.ts @@ -78,7 +78,7 @@ export class Run { * The world object. * @internal */ - private world: World; + private worldPromise: Promise; /** * Cached encryption key resolution. Resolved once on first use and @@ -98,7 +98,7 @@ export class Run { constructor(runId: string, opts?: { resilientStart?: boolean }) { this.runId = runId; - this.world = getWorld(); + this.worldPromise = getWorld(); this.resilientStart = opts?.resilientStart ?? 
false; } @@ -111,8 +111,9 @@ export class Run { private getEncryptionKey(): Promise { if (!this.encryptionKeyPromise) { this.encryptionKeyPromise = (async () => { - const run = await this.world.runs.get(this.runId); - const rawKey = await this.world.getEncryptionKeyForRun?.(run); + const world = await this.worldPromise; + const run = await world.runs.get(this.runId); + const rawKey = await world.getEncryptionKeyForRun?.(run); return rawKey ? await importKey(rawKey) : undefined; })(); } @@ -127,14 +128,15 @@ export class Run { * @returns A {@link StopSleepResult} object containing the number of sleep calls that were interrupted. */ async wakeUp(options?: StopSleepOptions): Promise { - return wakeUpRun(this.world, this.runId, options); + return wakeUpRun(await this.worldPromise, this.runId, options); } /** * Cancels the workflow run. */ async cancel(): Promise { - await this.world.events.create(this.runId, { + const world = await this.worldPromise; + await world.events.create(this.runId, { eventType: 'run_cancelled', specVersion: SPEC_VERSION_CURRENT, }); @@ -144,22 +146,26 @@ export class Run { * Whether the workflow run exists. */ get exists(): Promise { - return this.world.runs - .get(this.runId, { resolveData: 'none' }) - .then(() => true) - .catch((error) => { - if (WorkflowRunNotFoundError.is(error)) { - return false; - } - throw error; - }); + return this.worldPromise.then((world) => + world.runs + .get(this.runId, { resolveData: 'none' }) + .then(() => true) + .catch((error) => { + if (WorkflowRunNotFoundError.is(error)) { + return false; + } + throw error; + }) + ); } /** * The status of the workflow run. */ get status(): Promise { - return this.world.runs.get(this.runId).then((run) => run.status); + return this.worldPromise.then((world) => + world.runs.get(this.runId).then((run) => run.status) + ); } /** @@ -174,14 +180,18 @@ export class Run { * The name of the workflow. 
*/ get workflowName(): Promise { - return this.world.runs.get(this.runId).then((run) => run.workflowName); + return this.worldPromise.then((world) => + world.runs.get(this.runId).then((run) => run.workflowName) + ); } /** * The timestamp when the workflow run was created. */ get createdAt(): Promise { - return this.world.runs.get(this.runId).then((run) => run.createdAt); + return this.worldPromise.then((world) => + world.runs.get(this.runId).then((run) => run.createdAt) + ); } /** @@ -189,7 +199,9 @@ export class Run { * Returns undefined if the workflow has not started yet. */ get startedAt(): Promise { - return this.world.runs.get(this.runId).then((run) => run.startedAt); + return this.worldPromise.then((world) => + world.runs.get(this.runId).then((run) => run.startedAt) + ); } /** @@ -197,7 +209,9 @@ export class Run { * Returns undefined if the workflow has not completed yet. */ get completedAt(): Promise { - return this.world.runs.get(this.runId).then((run) => run.completedAt); + return this.worldPromise.then((world) => + world.runs.get(this.runId).then((run) => run.completedAt) + ); } /** @@ -237,10 +251,11 @@ export class Run { startIndex, }) as ReadableStream; - const world = this.world; + const worldPromise = this.worldPromise; const runId = this.runId; return Object.assign(stream, { getTailIndex: async (): Promise => { + const world = await worldPromise; const info = await world.streams.getInfo(runId, name); return info.tailIndex; }, @@ -253,6 +268,8 @@ export class Run { * @returns The workflow return value. */ private async pollReturnValue(): Promise { + const world = await this.worldPromise; + // When resilientStart is true, run_created failed and the run may // not exist yet. 
Retry on WorkflowRunNotFoundError up to 3 times // (1s + 3s + 6s = 10s total) to give the queue time to deliver @@ -264,7 +281,7 @@ export class Run { while (true) { try { - const run = await this.world.runs.get(this.runId); + const run = await world.runs.get(this.runId); if (run.status === 'completed') { const encryptionKey = await this.getEncryptionKey(); diff --git a/packages/core/src/runtime/start.ts b/packages/core/src/runtime/start.ts index 8d7af65d44..5d3cf88ddb 100644 --- a/packages/core/src/runtime/start.ts +++ b/packages/core/src/runtime/start.ts @@ -148,7 +148,7 @@ export async function start( ...Attribute.WorkflowArgumentsCount(args.length), }); - const world = opts?.world ?? getWorld(); + const world = opts?.world ?? (await getWorld()); let deploymentId = opts.deploymentId ?? (await world.getDeploymentId()); // When 'latest' is requested, resolve the actual latest deployment ID diff --git a/packages/core/src/runtime/step-handler.test.ts b/packages/core/src/runtime/step-handler.test.ts index aefac8aee3..b2fd7673c1 100644 --- a/packages/core/src/runtime/step-handler.test.ts +++ b/packages/core/src/runtime/step-handler.test.ts @@ -1,5 +1,13 @@ import { EntityConflictError, WorkflowWorldError } from '@workflow/errors'; -import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest'; +import { + afterEach, + beforeAll, + beforeEach, + describe, + expect, + it, + vi, +} from 'vitest'; // Use vi.hoisted so these are available in mock factories const { @@ -47,12 +55,12 @@ vi.mock('@vercel/functions', () => ({ // Mock the world module - createQueueHandler captures the handler vi.mock('./world.js', () => ({ - getWorld: vi.fn(() => ({ + getWorld: vi.fn(async () => ({ events: { create: mockEventsCreate }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), })), - getWorldHandlers: vi.fn(() => ({ + getWorldHandlers: vi.fn(async () => ({ createQueueHandler: vi.fn( ( _prefix: string, @@ -139,9 +147,10 @@ 
vi.mock('@workflow/utils/get-port', () => ({ getPort: vi.fn().mockResolvedValue(3000), })); -// Import the module AFTER all mocks are set up - this triggers createQueueHandler -// which populates capturedHandlerRef -import './step-handler.js'; +// Import the module AFTER all mocks are set up +// Since getWorldHandlers is now async, we need to call stepEntrypoint +// to trigger createQueueHandler and populate capturedHandlerRef +import { stepEntrypoint } from './step-handler.js'; import { MAX_QUEUE_DELIVERIES } from './constants.js'; import { getStepFunction } from '../private.js'; import { @@ -188,6 +197,12 @@ function createMessage(overrides: Record = {}) { } describe('step-handler 409 handling', () => { + // Trigger the lazy handler initialization by calling stepEntrypoint once. + // This invokes getWorldHandlers() which calls createQueueHandler and captures the handler. + beforeAll(async () => { + await stepEntrypoint(new Request('http://localhost')); + }); + beforeEach(() => { vi.clearAllMocks(); // Re-set mocks after clearAllMocks @@ -205,7 +220,7 @@ describe('step-handler 409 handling', () => { mockStepFn.maxRetries = 3; mockQueueMessage.mockResolvedValue(undefined); // Re-set getWorld mock since clearAllMocks resets it - vi.mocked(getWorld).mockReturnValue({ + vi.mocked(getWorld).mockResolvedValue({ events: { create: mockEventsCreate }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), @@ -506,7 +521,7 @@ describe('step-handler max deliveries', () => { mockStepFn.mockReset().mockResolvedValue('step-result'); mockStepFn.maxRetries = 3; mockQueueMessage.mockResolvedValue(undefined); - vi.mocked(getWorld).mockReturnValue({ + vi.mocked(getWorld).mockResolvedValue({ events: { create: mockEventsCreate }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), @@ -580,7 +595,7 @@ describe('step-handler step not found', () => { mockStepFn.mockReset().mockResolvedValue('step-result'); mockStepFn.maxRetries = 3; 
mockQueueMessage.mockResolvedValue(undefined); - vi.mocked(getWorld).mockReturnValue({ + vi.mocked(getWorld).mockResolvedValue({ events: { create: mockEventsCreate }, queue: mockQueue, getEncryptionKeyForRun: vi.fn().mockResolvedValue(undefined), diff --git a/packages/core/src/runtime/step-handler.ts b/packages/core/src/runtime/step-handler.ts index 65872545a8..f38184f293 100644 --- a/packages/core/src/runtime/step-handler.ts +++ b/packages/core/src/runtime/step-handler.ts @@ -43,551 +43,331 @@ import { queueMessage, withHealthCheck, } from './helpers.js'; -import { getWorld, getWorldHandlers } from './world.js'; +import { getWorld, getWorldHandlers, type WorldHandlers } from './world.js'; const DEFAULT_STEP_MAX_RETRIES = 3; -const { createQueueHandler, specVersion: worldSpecVersion } = - getWorldHandlers(); -const stepHandler = createQueueHandler( - '__wkf_step_', - async (message_, metadata) => { - // Check if this is a health check message - // NOTE: Health check messages are intentionally unauthenticated for monitoring purposes. - // They only write a simple status response to a stream and do not expose sensitive data. - // The stream name includes a unique correlationId that must be known by the caller. - const healthCheck = parseHealthCheckPayload(message_); - if (healthCheck) { - await handleHealthCheckMessage(healthCheck, 'step', worldSpecVersion); - return; - } - - const { - workflowName, - workflowRunId, - workflowStartedAt, - stepId, - traceCarrier: traceContext, - requestedAt, - } = StepInvokePayloadSchema.parse(message_); - const { requestId } = metadata; - - // --- Max delivery check --- - // Enforce max delivery limit before any infrastructure calls. - // This prevents runaway steps from consuming infinite queue deliveries. - // At this point, we want to do the minimal amount of work (no fetching - // of the step details, etc. 
We simply attempt to mark the step as failed - // and enqueue the workflow once, and if either of those fails, the message - // is still consumed but with adequate logging that an error occurred. - if (metadata.attempt > MAX_QUEUE_DELIVERIES) { - runtimeLogger.error( - `Step handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, - { - workflowRunId, - stepId, - stepName: metadata.queueName.slice('__wkf_step_'.length), - attempt: metadata.attempt, - } - ); - try { - const world = getWorld(); - await world.events.create( - workflowRunId, - { - eventType: 'step_failed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - error: `Step exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, - }, - }, - { requestId } +const stepHandler = (worldHandlers: WorldHandlers) => + worldHandlers.createQueueHandler( + '__wkf_step_', + async (message_, metadata) => { + // Check if this is a health check message + // NOTE: Health check messages are intentionally unauthenticated for monitoring purposes. + // They only write a simple status response to a stream and do not expose sensitive data. + // The stream name includes a unique correlationId that must be known by the caller. + const healthCheck = parseHealthCheckPayload(message_); + if (healthCheck) { + await handleHealthCheckMessage( + healthCheck, + 'step', + worldHandlers.specVersion ); - // Re-queue the workflow to handle the failed step - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), - }); - } catch (err) { - if (EntityConflictError.is(err) || RunExpiredError.is(err)) { - return; - } - // Can't even mark the step as failed. Consume the message to stop - // further retries. The run will remain in its current state. 
+ return; + } + + const { + workflowName, + workflowRunId, + workflowStartedAt, + stepId, + traceCarrier: traceContext, + requestedAt, + } = StepInvokePayloadSchema.parse(message_); + const { requestId } = metadata; + + // --- Max delivery check --- + // Enforce max delivery limit before any infrastructure calls. + // This prevents runaway steps from consuming infinite queue deliveries. + // At this point, we want to do the minimal amount of work (no fetching + // of the step details, etc. We simply attempt to mark the step as failed + // and enqueue the workflow once, and if either of those fails, the message + // is still consumed but with adequate logging that an error occurred. + if (metadata.attempt > MAX_QUEUE_DELIVERIES) { runtimeLogger.error( - `Failed to mark step as failed after ${metadata.attempt} delivery attempts. ` + - `A persistent error is preventing the step from being terminated. ` + - `The run will remain in its current state until manually resolved. ` + - `This is most likely due to a persistent outage of the workflow backend ` + - `or a bug in the workflow runtime and should be reported to the Workflow team.`, + `Step handler exceeded max deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, { workflowRunId, stepId, + stepName: metadata.queueName.slice('__wkf_step_'.length), attempt: metadata.attempt, - error: err instanceof Error ? err.message : String(err), } ); - } - return; - } - - const spanLinks = await linkToCurrentContext(); - // Execute step within the propagated trace context - return await withTraceContext(traceContext, async () => { - // Extract the step name from the topic name - const stepName = metadata.queueName.slice('__wkf_step_'.length); - const world = getWorld(); - const isVercel = process.env.VERCEL_URL !== undefined; - - // Resolve local async values concurrently before entering the trace span - const [port, spanKind] = await Promise.all([ - isVercel ? 
undefined : getPort(), - getSpanKind('CONSUMER'), - ]); - - return trace( - `STEP ${stepName}`, - { kind: spanKind, links: spanLinks }, - async (span) => { - span?.setAttributes({ - ...Attribute.StepName(stepName), - ...Attribute.StepAttempt(metadata.attempt), - // Standard OTEL messaging conventions - ...Attribute.MessagingSystem('vercel-queue'), - ...Attribute.MessagingDestinationName(metadata.queueName), - ...Attribute.MessagingMessageId(metadata.messageId), - ...Attribute.MessagingOperationType('process'), - ...getQueueOverhead({ requestedAt }), - }); - - // Note: Step function validation happens after step_started so we can - // properly fail the step (not the run) if the function is not registered. - // This allows the workflow to handle the step failure gracefully. - const stepFn = getStepFunction(stepName); - - span?.setAttributes({ - ...Attribute.WorkflowName(workflowName), - ...Attribute.WorkflowRunId(workflowRunId), - ...Attribute.StepId(stepId), - ...Attribute.StepTracePropagated(!!traceContext), + try { + const world = await getWorld(); + await world.events.create( + workflowRunId, + { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + error: `Step exceeded maximum queue deliveries (${metadata.attempt}/${MAX_QUEUE_DELIVERIES})`, + }, + }, + { requestId } + ); + // Re-queue the workflow to handle the failed step + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), }); - - // step_started validates state and returns the step entity, so no separate - // world.steps.get() call is needed. 
The server checks: - // - Step not in terminal state (returns 409) - // - retryAfter timestamp reached (returns 425 with Retry-After header) - // - Workflow still active (returns 410 if completed) - let step; - try { - const startResult = await world.events.create( + } catch (err) { + if (EntityConflictError.is(err) || RunExpiredError.is(err)) { + return; + } + // Can't even mark the step as failed. Consume the message to stop + // further retries. The run will remain in its current state. + runtimeLogger.error( + `Failed to mark step as failed after ${metadata.attempt} delivery attempts. ` + + `A persistent error is preventing the step from being terminated. ` + + `The run will remain in its current state until manually resolved. ` + + `This is most likely due to a persistent outage of the workflow backend ` + + `or a bug in the workflow runtime and should be reported to the Workflow team.`, + { workflowRunId, - { - eventType: 'step_started', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - }, - { requestId } - ); - - if (!startResult.step) { - throw new WorkflowRuntimeError( - `step_started event for "${stepId}" did not return step entity` - ); - } - step = startResult.step; - } catch (err) { - if (ThrottleError.is(err)) { - const retryRetryAfter = Math.max( - 1, - typeof err.retryAfter === 'number' ? 
err.retryAfter : 1 - ); - runtimeLogger.info( - 'Throttled again on retry, deferring to queue', - { - retryAfterSeconds: retryRetryAfter, - } - ); - return { timeoutSeconds: retryRetryAfter }; - } - if (RunExpiredError.is(err)) { - runtimeLogger.info( - `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` - ); - return; - } - if (EntityConflictError.is(err)) { - runtimeLogger.debug( - 'Step in terminal state, re-enqueuing workflow', - { - stepName, - stepId, - workflowRunId, - error: err.message, - } - ); - span?.setAttributes({ - ...Attribute.StepSkipped(true), - ...Attribute.StepSkipReason('completed'), - }); - span?.addEvent?.('step.skipped', { - 'skip.reason': 'terminal_state', - 'step.name': stepName, - 'step.id': stepId, - }); - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), - }); - return; - } - - // Too early: retryAfter timestamp not reached yet - // Return timeout to queue so it retries later - if (TooEarlyError.is(err)) { - const timeoutSeconds = Math.max(1, err.retryAfter ?? 1); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - }); - // Add span event for delayed retry - span?.addEvent?.('step.delayed', { - 'delay.reason': 'retry_after_not_reached', - 'delay.timeout_seconds': timeoutSeconds, - }); - runtimeLogger.debug('Step retryAfter timestamp not yet reached', { - stepName, - stepId, - retryAfterSeconds: err.retryAfter, - timeoutSeconds, - }); - return { timeoutSeconds }; + stepId, + attempt: metadata.attempt, + error: err instanceof Error ? 
err.message : String(err), } - // Re-throw other errors - throw err; - } - - runtimeLogger.debug('Step execution details', { - stepName, - stepId: step.stepId, - status: step.status, - attempt: step.attempt, - }); + ); + } + return; + } - span?.setAttributes({ - ...Attribute.StepStatus(step.status), - }); + const spanLinks = await linkToCurrentContext(); + // Execute step within the propagated trace context + return await withTraceContext(traceContext, async () => { + // Extract the step name from the topic name + const stepName = metadata.queueName.slice('__wkf_step_'.length); + const world = await getWorld(); + const isVercel = process.env.VERCEL_URL !== undefined; + + // Resolve local async values concurrently before entering the trace span + const [port, spanKind] = await Promise.all([ + isVercel ? undefined : getPort(), + getSpanKind('CONSUMER'), + ]); + + return trace( + `STEP ${stepName}`, + { kind: spanKind, links: spanLinks }, + async (span) => { + span?.setAttributes({ + ...Attribute.StepName(stepName), + ...Attribute.StepAttempt(metadata.attempt), + // Standard OTEL messaging conventions + ...Attribute.MessagingSystem('vercel-queue'), + ...Attribute.MessagingDestinationName(metadata.queueName), + ...Attribute.MessagingMessageId(metadata.messageId), + ...Attribute.MessagingOperationType('process'), + ...getQueueOverhead({ requestedAt }), + }); - // Validate step function exists AFTER step_started so we can - // properly fail the step (not the run) if the function is missing. - // This allows the workflow to handle the step failure gracefully, - // similar to how FatalError is handled. - if (!stepFn || typeof stepFn !== 'function') { - const err = new StepNotRegisteredError(stepName); + // Note: Step function validation happens after step_started so we can + // properly fail the step (not the run) if the function is not registered. + // This allows the workflow to handle the step failure gracefully. 
+ const stepFn = getStepFunction(stepName); - runtimeLogger.error( - 'Step function not registered, failing step (not run)', - { - workflowRunId, - stepName, - stepId, - error: err.message, - } - ); + span?.setAttributes({ + ...Attribute.WorkflowName(workflowName), + ...Attribute.WorkflowRunId(workflowRunId), + ...Attribute.StepId(stepId), + ...Attribute.StepTracePropagated(!!traceContext), + }); - // Fail the step via event (event-sourced architecture) - // This matches the FatalError pattern - fail the step and re-queue workflow + // step_started validates state and returns the step entity, so no separate + // world.steps.get() call is needed. The server checks: + // - Step not in terminal state (returns 409) + // - retryAfter timestamp reached (returns 425 with Retry-After header) + // - Workflow still active (returns 410 if completed) + let step; try { - await world.events.create( + const startResult = await world.events.create( workflowRunId, { - eventType: 'step_failed', + eventType: 'step_started', specVersion: SPEC_VERSION_CURRENT, correlationId: stepId, - eventData: { - error: err.message, - stack: err.stack, - }, }, { requestId } ); - } catch (stepFailErr) { - if (EntityConflictError.is(stepFailErr)) { + + if (!startResult.step) { + throw new WorkflowRuntimeError( + `step_started event for "${stepId}" did not return step entity` + ); + } + step = startResult.step; + } catch (err) { + if (ThrottleError.is(err)) { + const retryRetryAfter = Math.max( + 1, + typeof err.retryAfter === 'number' ? 
err.retryAfter : 1 + ); runtimeLogger.info( - 'Tried failing step for missing function, but step has already finished.', + 'Throttled again on retry, deferring to queue', { - workflowRunId, - stepId, - stepName, - message: stepFailErr.message, + retryAfterSeconds: retryRetryAfter, } ); + return { timeoutSeconds: retryRetryAfter }; + } + if (RunExpiredError.is(err)) { + runtimeLogger.info( + `Workflow run "${workflowRunId}" has already completed, skipping step "${stepId}": ${err.message}` + ); return; } - throw stepFailErr; - } - - span?.setAttributes({ - ...Attribute.StepStatus('failed'), - ...Attribute.StepFatalError(true), - }); - - // Re-invoke the workflow to handle the failed step - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), - }); - return; - } - - const maxRetries = stepFn.maxRetries ?? DEFAULT_STEP_MAX_RETRIES; - - span?.setAttributes({ - ...Attribute.StepMaxRetries(maxRetries), - }); - - let result: unknown; - - // Check max retries AFTER step_started (attempt was just incremented) - // step.attempt tracks how many times step_started has been called. - // Note: maxRetries is the number of RETRIES after the first attempt, so total attempts = maxRetries + 1 - // Use > here (not >=) because this guards against re-invocation AFTER all attempts are used. - // The post-failure check uses >= to decide whether to retry after a failure. 
- if (step.attempt > maxRetries + 1) { - const retryCount = step.attempt - 1; - const errorMessage = `Step "${stepName}" exceeded max retries (${retryCount} ${pluralize('retry', 'retries', retryCount)})`; - stepLogger.error('Step exceeded max retries', { - workflowRunId, - stepName, - retryCount, - }); - // Fail the step via event (event-sourced architecture) - try { - await world.events.create( - workflowRunId, - { - eventType: 'step_failed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - error: errorMessage, - stack: step.error?.stack, - }, - }, - { requestId } - ); - } catch (err) { if (EntityConflictError.is(err)) { - runtimeLogger.info( - 'Tried failing step, but step has already finished.', + runtimeLogger.debug( + 'Step in terminal state, re-enqueuing workflow', { - workflowRunId, - stepId, stepName, - message: err.message, + stepId, + workflowRunId, + error: err.message, } ); + span?.setAttributes({ + ...Attribute.StepSkipped(true), + ...Attribute.StepSkipReason('completed'), + }); + span?.addEvent?.('step.skipped', { + 'skip.reason': 'terminal_state', + 'step.name': stepName, + 'step.id': stepId, + }); + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), + }); return; } + + // Too early: retryAfter timestamp not reached yet + // Return timeout to queue so it retries later + if (TooEarlyError.is(err)) { + const timeoutSeconds = Math.max(1, err.retryAfter ?? 
1); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + }); + // Add span event for delayed retry + span?.addEvent?.('step.delayed', { + 'delay.reason': 'retry_after_not_reached', + 'delay.timeout_seconds': timeoutSeconds, + }); + runtimeLogger.debug( + 'Step retryAfter timestamp not yet reached', + { + stepName, + stepId, + retryAfterSeconds: err.retryAfter, + timeoutSeconds, + } + ); + return { timeoutSeconds }; + } + // Re-throw other errors throw err; } - span?.setAttributes({ - ...Attribute.StepStatus('failed'), - ...Attribute.StepRetryExhausted(true), + runtimeLogger.debug('Step execution details', { + stepName, + stepId: step.stepId, + status: step.status, + attempt: step.attempt, }); - // Re-invoke the workflow to handle the failed step - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), + span?.setAttributes({ + ...Attribute.StepStatus(step.status), }); - return; - } - // --- Infrastructure: prepare step input --- - // Network/server errors propagate to the queue handler for retry. - // WorkflowRuntimeError (data integrity issues) are fatal — retrying - // won't fix them, so we re-queue the workflow to surface the error. - // step_started already validated the step is in valid state (pending/running) - // and returned the updated step entity with incremented attempt + // Validate step function exists AFTER step_started so we can + // properly fail the step (not the run) if the function is missing. + // This allows the workflow to handle the step failure gracefully, + // similar to how FatalError is handled. 
+ if (!stepFn || typeof stepFn !== 'function') { + const err = new StepNotRegisteredError(stepName); - // step.attempt is now the current attempt number (after increment) - const attempt = step.attempt; - - if (!step.startedAt) { - const errorMessage = `Step "${stepId}" has no "startedAt" timestamp`; - runtimeLogger.error('Fatal runtime error during step setup', { - workflowRunId, - stepId, - error: errorMessage, - }); - try { - await world.events.create( - workflowRunId, + runtimeLogger.error( + 'Step function not registered, failing step (not run)', { - eventType: 'step_failed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - error: errorMessage, - stack: new Error(errorMessage).stack ?? '', - }, - }, - { requestId } - ); - } catch (failErr) { - if (EntityConflictError.is(failErr)) { - return; - } - throw failErr; - } - // Re-queue the workflow so it can process the step failure - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), - }); - return; - } - // Capture startedAt for use in async callback (TypeScript narrowing doesn't persist) - const stepStartedAt = step.startedAt; - - // Hydrate the step input arguments, closure variables, and thisVal - // NOTE: This captures only the synchronous portion of hydration. Any async - // operations (e.g., stream loading) are added to `ops` and executed later - // via Promise.all(ops) - their timing is not included in this measurement. - const ops: Promise[] = []; - const rawKey = await world.getEncryptionKeyForRun?.(workflowRunId); - const encryptionKey = rawKey ? 
await importKey(rawKey) : undefined; - const hydratedInput = await trace( - 'step.hydrate', - {}, - async (hydrateSpan) => { - const startTime = Date.now(); - const result = await hydrateStepArguments( - step.input, - workflowRunId, - encryptionKey, - ops + workflowRunId, + stepName, + stepId, + error: err.message, + } ); - const durationMs = Date.now() - startTime; - hydrateSpan?.setAttributes({ - ...Attribute.StepArgumentsCount(result.args.length), - ...Attribute.QueueDeserializeTimeMs(durationMs), - }); - return result; - } - ); - - const args = hydratedInput.args; - const thisVal = hydratedInput.thisVal ?? null; - // --- User code execution --- - // Only errors from stepFn.apply() (user step code) should produce - // step_failed/step_retrying. Infrastructure errors (network, server) - // must propagate to the queue handler for automatic retry. - let userCodeError: unknown; - let userCodeFailed = false; - - const executionStartTime = Date.now(); - try { - result = await trace('step.execute', {}, async () => { - return await contextStorage.run( - { - stepMetadata: { - stepName, - stepId, - stepStartedAt: new Date(+stepStartedAt), - attempt, - }, - workflowMetadata: { - workflowName, - workflowRunId, - workflowStartedAt: new Date(+workflowStartedAt), - // TODO: there should be a getUrl method on the world interface itself. This - // solution only works for vercel + local worlds. - url: isVercel - ? `https://${process.env.VERCEL_URL}` - : `http://localhost:${port ?? 
3000}`, - features: { encryption: !!encryptionKey }, + // Fail the step via event (event-sourced architecture) + // This matches the FatalError pattern - fail the step and re-queue workflow + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + error: err.message, + stack: err.stack, + }, }, - ops, - closureVars: hydratedInput.closureVars, - encryptionKey, - }, - () => stepFn.apply(thisVal, args) - ); - }); - } catch (err) { - userCodeError = err; - userCodeFailed = true; - } - const executionTimeMs = Date.now() - executionStartTime; - - span?.setAttributes({ - ...Attribute.QueueExecutionTimeMs(executionTimeMs), - }); + { requestId } + ); + } catch (stepFailErr) { + if (EntityConflictError.is(stepFailErr)) { + runtimeLogger.info( + 'Tried failing step for missing function, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: stepFailErr.message, + } + ); + return; + } + throw stepFailErr; + } - // --- Handle user code errors --- - if (userCodeFailed) { - const err = userCodeError; + span?.setAttributes({ + ...Attribute.StepStatus('failed'), + ...Attribute.StepFatalError(true), + }); - // Infrastructure errors that somehow surfaced through user code - // should propagate to the queue handler for retry, not consume - // step attempts. 
- if (RunExpiredError.is(err)) { - // Workflow has already completed, so no-op - stepLogger.info('Workflow run already completed, skipping step', { - workflowRunId, - stepId, - message: err.message, + // Re-invoke the workflow to handle the failed step + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), }); return; } - if (WorkflowWorldError.is(err)) { - if (err.status !== undefined && err.status >= 500) { - throw err; - } - } - - const normalizedError = await normalizeUnknownError(err); - const normalizedStack = - normalizedError.stack || getErrorStack(err) || ''; - - // Record exception for OTEL error tracking - if (err instanceof Error) { - span?.recordException?.(err); - } - // Determine error category and retryability - const isFatal = FatalError.is(err); - const isRetryable = RetryableError.is(err); - const errorCategory = isFatal - ? 'fatal' - : isRetryable - ? 'retryable' - : 'transient'; + const maxRetries = stepFn.maxRetries ?? DEFAULT_STEP_MAX_RETRIES; span?.setAttributes({ - ...Attribute.StepErrorName(getErrorName(err)), - ...Attribute.StepErrorMessage(normalizedError.message), - ...Attribute.ErrorType(getErrorName(err)), - ...Attribute.ErrorCategory(errorCategory), - ...Attribute.ErrorRetryable(!isFatal), + ...Attribute.StepMaxRetries(maxRetries), }); - if (isFatal) { - stepLogger.error( - 'Encountered FatalError while executing step, bubbling up to parent workflow', - { - workflowRunId, - stepName, - errorStack: normalizedStack, - } - ); + let result: unknown; + + // Check max retries AFTER step_started (attempt was just incremented) + // step.attempt tracks how many times step_started has been called. + // Note: maxRetries is the number of RETRIES after the first attempt, so total attempts = maxRetries + 1 + // Use > here (not >=) because this guards against re-invocation AFTER all attempts are used. 
+ // The post-failure check uses >= to decide whether to retry after a failure. + if (step.attempt > maxRetries + 1) { + const retryCount = step.attempt - 1; + const errorMessage = `Step "${stepName}" exceeded max retries (${retryCount} ${pluralize('retry', 'retries', retryCount)})`; + stepLogger.error('Step exceeded max retries', { + workflowRunId, + stepName, + retryCount, + }); // Fail the step via event (event-sourced architecture) try { await world.events.create( @@ -597,57 +377,226 @@ const stepHandler = createQueueHandler( specVersion: SPEC_VERSION_CURRENT, correlationId: stepId, eventData: { - error: normalizedError.message, - stack: normalizedStack, + error: errorMessage, + stack: step.error?.stack, }, }, { requestId } ); - } catch (stepFailErr) { - if (EntityConflictError.is(stepFailErr)) { + } catch (err) { + if (EntityConflictError.is(err)) { runtimeLogger.info( 'Tried failing step, but step has already finished.', { workflowRunId, stepId, stepName, - message: stepFailErr.message, + message: err.message, } ); return; } - throw stepFailErr; + throw err; } span?.setAttributes({ ...Attribute.StepStatus('failed'), - ...Attribute.StepFatalError(true), + ...Attribute.StepRetryExhausted(true), + }); + + // Re-invoke the workflow to handle the failed step + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), + }); + return; + } + + // --- Infrastructure: prepare step input --- + // Network/server errors propagate to the queue handler for retry. + // WorkflowRuntimeError (data integrity issues) are fatal — retrying + // won't fix them, so we re-queue the workflow to surface the error. 
+ // step_started already validated the step is in valid state (pending/running) + // and returned the updated step entity with incremented attempt + + // step.attempt is now the current attempt number (after increment) + const attempt = step.attempt; + + if (!step.startedAt) { + const errorMessage = `Step "${stepId}" has no "startedAt" timestamp`; + runtimeLogger.error('Fatal runtime error during step setup', { + workflowRunId, + stepId, + error: errorMessage, + }); + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + error: errorMessage, + stack: new Error(errorMessage).stack ?? '', + }, + }, + { requestId } + ); + } catch (failErr) { + if (EntityConflictError.is(failErr)) { + return; + } + throw failErr; + } + // Re-queue the workflow so it can process the step failure + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), + }); + return; + } + // Capture startedAt for use in async callback (TypeScript narrowing doesn't persist) + const stepStartedAt = step.startedAt; + + // Hydrate the step input arguments, closure variables, and thisVal + // NOTE: This captures only the synchronous portion of hydration. Any async + // operations (e.g., stream loading) are added to `ops` and executed later + // via Promise.all(ops) - their timing is not included in this measurement. + const ops: Promise[] = []; + const rawKey = await world.getEncryptionKeyForRun?.(workflowRunId); + const encryptionKey = rawKey ? 
await importKey(rawKey) : undefined; + const hydratedInput = await trace( + 'step.hydrate', + {}, + async (hydrateSpan) => { + const startTime = Date.now(); + const result = await hydrateStepArguments( + step.input, + workflowRunId, + encryptionKey, + ops + ); + const durationMs = Date.now() - startTime; + hydrateSpan?.setAttributes({ + ...Attribute.StepArgumentsCount(result.args.length), + ...Attribute.QueueDeserializeTimeMs(durationMs), + }); + return result; + } + ); + + const args = hydratedInput.args; + const thisVal = hydratedInput.thisVal ?? null; + + // --- User code execution --- + // Only errors from stepFn.apply() (user step code) should produce + // step_failed/step_retrying. Infrastructure errors (network, server) + // must propagate to the queue handler for automatic retry. + let userCodeError: unknown; + let userCodeFailed = false; + + const executionStartTime = Date.now(); + try { + result = await trace('step.execute', {}, async () => { + return await contextStorage.run( + { + stepMetadata: { + stepName, + stepId, + stepStartedAt: new Date(+stepStartedAt), + attempt, + }, + workflowMetadata: { + workflowName, + workflowRunId, + workflowStartedAt: new Date(+workflowStartedAt), + // TODO: there should be a getUrl method on the world interface itself. This + // solution only works for vercel + local worlds. + url: isVercel + ? `https://${process.env.VERCEL_URL}` + : `http://localhost:${port ?? 3000}`, + features: { encryption: !!encryptionKey }, + }, + ops, + closureVars: hydratedInput.closureVars, + encryptionKey, + }, + () => stepFn.apply(thisVal, args) + ); }); - } else { - const maxRetries = stepFn.maxRetries ?? 
DEFAULT_STEP_MAX_RETRIES; - // step.attempt was incremented by step_started, use it here - const currentAttempt = step.attempt; + } catch (err) { + userCodeError = err; + userCodeFailed = true; + } + const executionTimeMs = Date.now() - executionStartTime; + + span?.setAttributes({ + ...Attribute.QueueExecutionTimeMs(executionTimeMs), + }); + + // --- Handle user code errors --- + if (userCodeFailed) { + const err = userCodeError; + + // Infrastructure errors that somehow surfaced through user code + // should propagate to the queue handler for retry, not consume + // step attempts. + if (RunExpiredError.is(err)) { + // Workflow has already completed, so no-op + stepLogger.info( + 'Workflow run already completed, skipping step', + { + workflowRunId, + stepId, + message: err.message, + } + ); + return; + } + if (WorkflowWorldError.is(err)) { + if (err.status !== undefined && err.status >= 500) { + throw err; + } + } + + const normalizedError = await normalizeUnknownError(err); + const normalizedStack = + normalizedError.stack || getErrorStack(err) || ''; + + // Record exception for OTEL error tracking + if (err instanceof Error) { + span?.recordException?.(err); + } + + // Determine error category and retryability + const isFatal = FatalError.is(err); + const isRetryable = RetryableError.is(err); + const errorCategory = isFatal + ? 'fatal' + : isRetryable + ? 
'retryable' + : 'transient'; span?.setAttributes({ - ...Attribute.StepAttempt(currentAttempt), - ...Attribute.StepMaxRetries(maxRetries), + ...Attribute.StepErrorName(getErrorName(err)), + ...Attribute.StepErrorMessage(normalizedError.message), + ...Attribute.ErrorType(getErrorName(err)), + ...Attribute.ErrorCategory(errorCategory), + ...Attribute.ErrorRetryable(!isFatal), }); - // Note: maxRetries is the number of RETRIES after the first attempt, so total attempts = maxRetries + 1 - if (currentAttempt >= maxRetries + 1) { - // Max retries reached - const retryCount = step.attempt - 1; + if (isFatal) { stepLogger.error( - 'Max retries reached, bubbling error to parent workflow', + 'Encountered FatalError while executing step, bubbling up to parent workflow', { workflowRunId, stepName, - attempt: step.attempt, - retryCount, errorStack: normalizedStack, } ); - const errorMessage = `Step "${stepName}" failed after ${maxRetries} ${pluralize('retry', 'retries', maxRetries)}: ${normalizedError.message}`; // Fail the step via event (event-sourced architecture) try { await world.events.create( @@ -657,7 +606,7 @@ const stepHandler = createQueueHandler( specVersion: SPEC_VERSION_CURRENT, correlationId: stepId, eventData: { - error: errorMessage, + error: normalizedError.message, stack: normalizedStack, }, }, @@ -681,190 +630,263 @@ const stepHandler = createQueueHandler( span?.setAttributes({ ...Attribute.StepStatus('failed'), - ...Attribute.StepRetryExhausted(true), + ...Attribute.StepFatalError(true), }); } else { - // Not at max retries yet - log as a retryable error - if (RetryableError.is(err)) { - stepLogger.info( - 'Encountered RetryableError, step will be retried', + const maxRetries = + stepFn.maxRetries ?? 
DEFAULT_STEP_MAX_RETRIES; + // step.attempt was incremented by step_started, use it here + const currentAttempt = step.attempt; + + span?.setAttributes({ + ...Attribute.StepAttempt(currentAttempt), + ...Attribute.StepMaxRetries(maxRetries), + }); + + // Note: maxRetries is the number of RETRIES after the first attempt, so total attempts = maxRetries + 1 + if (currentAttempt >= maxRetries + 1) { + // Max retries reached + const retryCount = step.attempt - 1; + stepLogger.error( + 'Max retries reached, bubbling error to parent workflow', { workflowRunId, stepName, - attempt: currentAttempt, - message: err.message, + attempt: step.attempt, + retryCount, + errorStack: normalizedStack, } ); - } else { - stepLogger.info('Encountered Error, step will be retried', { - workflowRunId, - stepName, - attempt: currentAttempt, - errorStack: normalizedStack, - }); - } - // Set step to pending for retry via event (event-sourced architecture) - // step_retrying records the error and sets status to pending - try { - await world.events.create( - workflowRunId, - { - eventType: 'step_retrying', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - error: normalizedError.message, - stack: normalizedStack, - ...(RetryableError.is(err) && { - retryAfter: err.retryAfter, - }), + const errorMessage = `Step "${stepName}" failed after ${maxRetries} ${pluralize('retry', 'retries', maxRetries)}: ${normalizedError.message}`; + // Fail the step via event (event-sourced architecture) + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_failed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + error: errorMessage, + stack: normalizedStack, + }, }, - }, - { requestId } - ); - } catch (stepRetryErr) { - if (EntityConflictError.is(stepRetryErr)) { - runtimeLogger.info( - 'Tried retrying step, but step has already finished.', + { requestId } + ); + } catch (stepFailErr) { + if (EntityConflictError.is(stepFailErr)) { + 
runtimeLogger.info( + 'Tried failing step, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: stepFailErr.message, + } + ); + return; + } + throw stepFailErr; + } + + span?.setAttributes({ + ...Attribute.StepStatus('failed'), + ...Attribute.StepRetryExhausted(true), + }); + } else { + // Not at max retries yet - log as a retryable error + if (RetryableError.is(err)) { + stepLogger.info( + 'Encountered RetryableError, step will be retried', { workflowRunId, - stepId, stepName, - message: stepRetryErr.message, + attempt: currentAttempt, + message: err.message, } ); - return; + } else { + stepLogger.info('Encountered Error, step will be retried', { + workflowRunId, + stepName, + attempt: currentAttempt, + errorStack: normalizedStack, + }); + } + // Set step to pending for retry via event (event-sourced architecture) + // step_retrying records the error and sets status to pending + try { + await world.events.create( + workflowRunId, + { + eventType: 'step_retrying', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + error: normalizedError.message, + stack: normalizedStack, + ...(RetryableError.is(err) && { + retryAfter: err.retryAfter, + }), + }, + }, + { requestId } + ); + } catch (stepRetryErr) { + if (EntityConflictError.is(stepRetryErr)) { + runtimeLogger.info( + 'Tried retrying step, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: stepRetryErr.message, + } + ); + return; + } + throw stepRetryErr; } - throw stepRetryErr; - } - const timeoutSeconds = Math.max( - 1, - RetryableError.is(err) - ? Math.ceil((+err.retryAfter.getTime() - Date.now()) / 1000) - : 1 - ); + const timeoutSeconds = Math.max( + 1, + RetryableError.is(err) + ? 
Math.ceil( + (+err.retryAfter.getTime() - Date.now()) / 1000 + ) + : 1 + ); - span?.setAttributes({ - ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), - ...Attribute.StepRetryWillRetry(true), - }); + span?.setAttributes({ + ...Attribute.StepRetryTimeoutSeconds(timeoutSeconds), + ...Attribute.StepRetryWillRetry(true), + }); - // Add span event for retry scheduling - span?.addEvent?.('retry.scheduled', { - 'retry.timeout_seconds': timeoutSeconds, - 'retry.attempt': currentAttempt, - 'retry.max_retries': maxRetries, - }); + // Add span event for retry scheduling + span?.addEvent?.('retry.scheduled', { + 'retry.timeout_seconds': timeoutSeconds, + 'retry.attempt': currentAttempt, + 'retry.max_retries': maxRetries, + }); - // It's a retryable error - so have the queue keep the message visible - // so that it gets retried. - return { timeoutSeconds }; + // It's a retryable error - so have the queue keep the message visible + // so that it gets retried. + return { timeoutSeconds }; + } } - } - // Re-invoke the workflow to handle the failed/retrying step - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: workflowRunId, - traceCarrier: await serializeTraceCarrier(), - requestedAt: new Date(), - }); - return; - } + // Re-invoke the workflow to handle the failed/retrying step + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier: await serializeTraceCarrier(), + requestedAt: new Date(), + }); + return; + } - // --- Infrastructure: complete the step --- - // Errors here (network failures, server errors) propagate to the - // queue handler for automatic retry. 
- - // NOTE: None of the code from this point is guaranteed to run - // Since the step might fail or cause a function timeout and the process might be SIGKILL'd - // The workflow runtime must be resilient to the below code not executing on a failed step - result = await trace('step.dehydrate', {}, async (dehydrateSpan) => { - const startTime = Date.now(); - const dehydrated = await dehydrateStepReturnValue( - result, - workflowRunId, - encryptionKey, - ops + // --- Infrastructure: complete the step --- + // Errors here (network failures, server errors) propagate to the + // queue handler for automatic retry. + + // NOTE: None of the code from this point is guaranteed to run + // Since the step might fail or cause a function timeout and the process might be SIGKILL'd + // The workflow runtime must be resilient to the below code not executing on a failed step + result = await trace( + 'step.dehydrate', + {}, + async (dehydrateSpan) => { + const startTime = Date.now(); + const dehydrated = await dehydrateStepReturnValue( + result, + workflowRunId, + encryptionKey, + ops + ); + const durationMs = Date.now() - startTime; + dehydrateSpan?.setAttributes({ + ...Attribute.QueueSerializeTimeMs(durationMs), + ...Attribute.StepResultType(typeof dehydrated), + }); + return dehydrated; + } ); - const durationMs = Date.now() - startTime; - dehydrateSpan?.setAttributes({ - ...Attribute.QueueSerializeTimeMs(durationMs), - ...Attribute.StepResultType(typeof dehydrated), - }); - return dehydrated; - }); - waitUntil( - Promise.all(ops).catch((err) => { - // Ignore expected client disconnect errors (e.g., browser refresh during streaming) - const isAbortError = - err?.name === 'AbortError' || err?.name === 'ResponseAborted'; - if (!isAbortError) throw err; - }) - ); + waitUntil( + Promise.all(ops).catch((err) => { + // Ignore expected client disconnect errors (e.g., browser refresh during streaming) + const isAbortError = + err?.name === 'AbortError' || err?.name === 'ResponseAborted'; 
+ if (!isAbortError) throw err; + }) + ); - // Run step_completed and trace serialization concurrently; - // the trace carrier is used in the final queueMessage call below - let stepCompleted409 = false; - const [, traceCarrier] = await Promise.all([ - world.events - .create( - workflowRunId, - { - eventType: 'step_completed', - specVersion: SPEC_VERSION_CURRENT, - correlationId: stepId, - eventData: { - result: result as Uint8Array, + // Run step_completed and trace serialization concurrently; + // the trace carrier is used in the final queueMessage call below + let stepCompleted409 = false; + const [, traceCarrier] = await Promise.all([ + world.events + .create( + workflowRunId, + { + eventType: 'step_completed', + specVersion: SPEC_VERSION_CURRENT, + correlationId: stepId, + eventData: { + result: result as Uint8Array, + }, }, - }, - { requestId } - ) - .catch((err: unknown) => { - if (EntityConflictError.is(err)) { - runtimeLogger.info( - 'Tried completing step, but step has already finished.', - { - workflowRunId, - stepId, - stepName, - message: err.message, - } - ); - stepCompleted409 = true; - return; - } - throw err; - }), - serializeTraceCarrier(), - ]); + { requestId } + ) + .catch((err: unknown) => { + if (EntityConflictError.is(err)) { + runtimeLogger.info( + 'Tried completing step, but step has already finished.', + { + workflowRunId, + stepId, + stepName, + message: err.message, + } + ); + stepCompleted409 = true; + return; + } + throw err; + }), + serializeTraceCarrier(), + ]); - if (stepCompleted409) { - return; - } + if (stepCompleted409) { + return; + } - span?.setAttributes({ - ...Attribute.StepStatus('completed'), - ...Attribute.StepResultType(typeof result), - }); + span?.setAttributes({ + ...Attribute.StepStatus('completed'), + ...Attribute.StepResultType(typeof result), + }); - // Queue the workflow continuation with the concurrently-resolved trace carrier - await queueMessage(world, getWorkflowQueueName(workflowName), { - runId: 
workflowRunId, - traceCarrier, - requestedAt: new Date(), - }); - } - ); - }); - } -); + // Queue the workflow continuation with the concurrently-resolved trace carrier + await queueMessage(world, getWorkflowQueueName(workflowName), { + runId: workflowRunId, + traceCarrier, + requestedAt: new Date(), + }); + } + ); + }); + } + ); /** * A single route that handles any step execution request and routes to the * appropriate step function. We may eventually want to create different bundles * for each step, this is temporary. */ +let cachedStepHandler: ((req: Request) => Promise) | undefined; export const stepEntrypoint: (req: Request) => Promise = - /* @__PURE__ */ withHealthCheck(stepHandler, worldSpecVersion); + /* @__PURE__ */ withHealthCheck(async (req) => { + if (!cachedStepHandler) { + cachedStepHandler = stepHandler(await getWorldHandlers()); + } + return cachedStepHandler(req); + }); diff --git a/packages/core/src/runtime/world.ts b/packages/core/src/runtime/world.ts index 524dc3cd7a..1cb2d8acca 100644 --- a/packages/core/src/runtime/world.ts +++ b/packages/core/src/runtime/world.ts @@ -1,5 +1,6 @@ import { createRequire } from 'node:module'; -import { join } from 'node:path'; +import { resolve } from 'node:path'; +import { pathToFileURL } from 'node:url'; import { isVercelWorldTarget, resolveWorkflowTargetWorld, @@ -8,16 +9,55 @@ import type { World } from '@workflow/world'; import { createLocalWorld } from '@workflow/world-local'; import { createVercelWorld } from '@workflow/world-vercel'; -const require = createRequire(join(process.cwd(), 'index.js')); +const require = createRequire( + pathToFileURL(process.cwd() + '/package.json').href +); const WorldCache = Symbol.for('@workflow/world//cache'); const StubbedWorldCache = Symbol.for('@workflow/world//stubbedCache'); +const WorldCachePromise = Symbol.for('@workflow/world//cachePromise'); +const StubbedWorldCachePromise = Symbol.for( + '@workflow/world//stubbedCachePromise' +); const globalSymbols: typeof 
globalThis & { [WorldCache]?: World; [StubbedWorldCache]?: World; + [WorldCachePromise]?: Promise; + [StubbedWorldCachePromise]?: Promise; } = globalThis; +/** + * Hides the dynamic import behind `new Function` to prevent bundlers from + * trying to resolve it at build time, since the world module may not exist + * at build time. Falls back to `require()` in environments where + * `new Function`-based `import()` is unavailable (e.g. CJS test runners). + */ +const dynamicImport = new Function('specifier', 'return import(specifier)') as ( + specifier: string +) => Promise; + +function resolveModulePath(specifier: string): string { + // Already a file:// URL + if (specifier.startsWith('file://')) { + return specifier; + } + // Absolute path - convert to file:// URL + if (specifier.startsWith('/')) { + return pathToFileURL(specifier).href; + } + // Relative path - resolve relative to cwd and convert to file:// URL + if (specifier.startsWith('./') || specifier.startsWith('../')) { + return pathToFileURL(resolve(process.cwd(), specifier)).href; + } + // Package specifier - use require.resolve to find the package + try { + return pathToFileURL(require.resolve(specifier)).href; + } catch { + return specifier; + } +} + /** * Create a new world instance based on environment variables. * WORKFLOW_TARGET_WORLD is used to determine the target world. @@ -30,7 +70,7 @@ const globalSymbols: typeof globalThis & { * vars should call createVercelWorld() directly with an explicit config and * use setWorld() to inject the instance. */ -export const createWorld = (): World => { +export const createWorld = async (): Promise => { const targetWorld = resolveWorkflowTargetWorld(); if (isVercelWorldTarget(targetWorld)) { @@ -62,7 +102,16 @@ export const createWorld = (): World => { }); } - const mod = require(targetWorld); + // Try dynamic import() first — ESM-first since this PR's purpose is ESM support. 
+ // Fall back to require() for environments where `new Function`-based import() + // is unavailable (e.g. CJS test runners). + let mod: any; + try { + const resolvedPath = resolveModulePath(targetWorld); + mod = await dynamicImport(resolvedPath); + } catch { + mod = require(targetWorld); + } if (typeof mod === 'function') { return mod() as World; } else if (typeof mod.default === 'function') { @@ -76,6 +125,8 @@ export const createWorld = (): World => { ); }; +export type WorldHandlers = Pick; + /** * Some functions from the world are needed at build time, but we do NOT want * to cache the world in those instances for general use, since we don't have @@ -85,14 +136,19 @@ export const createWorld = (): World => { * Once we migrate to a file-based configuration (workflow.config.ts), we should * be able to re-combine getWorld and getWorldHandlers into one singleton. */ -export const getWorldHandlers = (): Pick< - World, - 'createQueueHandler' | 'specVersion' -> => { +export const getWorldHandlers = async (): Promise => { if (globalSymbols[StubbedWorldCache]) { return globalSymbols[StubbedWorldCache]; } - const _world = createWorld(); + // Store the promise immediately to prevent race conditions with concurrent calls. + // Clear on rejection so subsequent calls can retry instead of caching the failure. 
+ if (!globalSymbols[StubbedWorldCachePromise]) { + globalSymbols[StubbedWorldCachePromise] = createWorld().catch((err) => { + globalSymbols[StubbedWorldCachePromise] = undefined; + throw err; + }); + } + const _world = await globalSymbols[StubbedWorldCachePromise]; globalSymbols[StubbedWorldCache] = _world; return { createQueueHandler: _world.createQueueHandler, @@ -100,11 +156,19 @@ export const getWorldHandlers = (): Pick< }; }; -export const getWorld = (): World => { +export const getWorld = async (): Promise => { if (globalSymbols[WorldCache]) { return globalSymbols[WorldCache]; } - globalSymbols[WorldCache] = createWorld(); + // Store the promise immediately to prevent race conditions with concurrent calls. + // Clear on rejection so subsequent calls can retry instead of caching the failure. + if (!globalSymbols[WorldCachePromise]) { + globalSymbols[WorldCachePromise] = createWorld().catch((err) => { + globalSymbols[WorldCachePromise] = undefined; + throw err; + }); + } + globalSymbols[WorldCache] = await globalSymbols[WorldCachePromise]; return globalSymbols[WorldCache]; }; @@ -115,4 +179,6 @@ export const getWorld = (): World => { export const setWorld = (world: World | undefined): void => { globalSymbols[WorldCache] = world; globalSymbols[StubbedWorldCache] = world; + globalSymbols[WorldCachePromise] = undefined; + globalSymbols[StubbedWorldCachePromise] = undefined; }; diff --git a/packages/core/src/serialization.ts b/packages/core/src/serialization.ts index 77346ca140..1c07c7e24c 100644 --- a/packages/core/src/serialization.ts +++ b/packages/core/src/serialization.ts @@ -426,7 +426,7 @@ export class WorkflowServerReadableStream extends ReadableStream { pull: async (controller) => { let reader = this.#reader; if (!reader) { - const world = getWorld(); + const world = await getWorld(); const stream = await world.streams.get(runId, name, startIndex); reader = this.#reader = stream.getReader(); } @@ -469,7 +469,7 @@ export class WorkflowServerWritableStream 
extends WritableStream { if (typeof name !== 'string' || name.length === 0) { throw new Error(`"name" is required, got "${name}"`); } - const world = getWorld(); + const worldPromise = getWorld(); // Buffering state for batched writes // Encryption/decryption is handled at the framing level by @@ -477,6 +477,7 @@ export class WorkflowServerWritableStream extends WritableStream { let buffer: Uint8Array[] = []; let flushTimer: ReturnType | null = null; let flushPromise: Promise | null = null; + let resolvedFlushIntervalMs: number | undefined; const flush = async (): Promise => { if (flushTimer) { @@ -490,6 +491,12 @@ export class WorkflowServerWritableStream extends WritableStream { // This prevents data loss if the write operation fails const chunksToFlush = buffer.slice(); + const world = await worldPromise; + // Cache the flush interval from the world on first use + if (resolvedFlushIntervalMs === undefined) { + resolvedFlushIntervalMs = + world.streamFlushIntervalMs ?? STREAM_FLUSH_INTERVAL_MS; + } // Use writeMulti if available for batch writes if ( typeof world.streams.writeMulti === 'function' && @@ -528,7 +535,7 @@ export class WorkflowServerWritableStream extends WritableStream { for (const w of currentWaiters) w.reject(err); } ); - }, world.streamFlushIntervalMs ?? STREAM_FLUSH_INTERVAL_MS); + }, resolvedFlushIntervalMs ?? 
STREAM_FLUSH_INTERVAL_MS); }; super({ @@ -561,6 +568,7 @@ export class WorkflowServerWritableStream extends WritableStream { // Flush any remaining buffered chunks await flush(); + const world = await worldPromise; await world.streams.close(runId, name); }, abort(reason) { diff --git a/packages/core/src/writable-stream.test.ts b/packages/core/src/writable-stream.test.ts index 9d6488a62a..e570b1f14e 100644 --- a/packages/core/src/writable-stream.test.ts +++ b/packages/core/src/writable-stream.test.ts @@ -12,7 +12,10 @@ describe('WorkflowServerWritableStream', () => { writeMulti: ReturnType; close: ReturnType; }; - let mockWorld: { streams: typeof mockStreams; streamFlushIntervalMs?: number }; + let mockWorld: { + streams: typeof mockStreams; + streamFlushIntervalMs?: number; + }; beforeEach(async () => { mockStreams = { diff --git a/packages/docs-typecheck/src/docs-globals.d.ts b/packages/docs-typecheck/src/docs-globals.d.ts index cd3bc1f596..05741b4aa8 100644 --- a/packages/docs-typecheck/src/docs-globals.d.ts +++ b/packages/docs-typecheck/src/docs-globals.d.ts @@ -232,6 +232,9 @@ declare global { queue: (...args: any[]) => Promise; createQueueHandler: (...args: any[]) => any; }; + /** Resolves the configured World (async — may perform dynamic import / env-based setup). 
*/ + function getWorld(): Promise; + const streamId: string; const streamName: string; const hookId: string; diff --git a/packages/web/app/components/hooks-table.tsx b/packages/web/app/components/hooks-table.tsx index ff93b2c9bd..6b8662de37 100644 --- a/packages/web/app/components/hooks-table.tsx +++ b/packages/web/app/components/hooks-table.tsx @@ -142,7 +142,11 @@ export function HooksTable({ setInvocationData((prev) => { const updated = new Map(prev); for (const hook of hooks) { - updated.set(hook.hookId, { count: 0, hasMore: false, loading: false }); + updated.set(hook.hookId, { + count: 0, + hasMore: false, + loading: false, + }); } return updated; }); @@ -182,7 +186,11 @@ export function HooksTable({ setInvocationData((prev) => { const updated = new Map(prev); for (const hook of hooks) { - updated.set(hook.hookId, { count: 0, hasMore: false, loading: false }); + updated.set(hook.hookId, { + count: 0, + hasMore: false, + loading: false, + }); } return updated; }); diff --git a/packages/web/app/lib/client/hooks/use-resource-data.test.ts b/packages/web/app/lib/client/hooks/use-resource-data.test.ts index 1bf306af73..8f7d9d1fd5 100644 --- a/packages/web/app/lib/client/hooks/use-resource-data.test.ts +++ b/packages/web/app/lib/client/hooks/use-resource-data.test.ts @@ -16,11 +16,7 @@ vi.mock('~/lib/rpc-client', () => ({ import { waitEventsToWaitEntity } from '@workflow/web-shared'; import type { WorkflowRun } from '@workflow/world'; -import { - fetchEvents, - fetchHook, - fetchRun, -} from '~/lib/rpc-client'; +import { fetchEvents, fetchHook, fetchRun } from '~/lib/rpc-client'; const env = { SOME_VAR: 'test' }; @@ -116,8 +112,18 @@ describe('useWorkflowResourceData', () => { it('shows sleep entity constructed from events', async () => { const events = [ - { eventId: 'e1', type: 'sleep_scheduled', correlationId: 'sleep-corr-1', data: {} }, - { eventId: 'e2', type: 'other_event', correlationId: 'other-id', data: {} }, + { + eventId: 'e1', + type: 'sleep_scheduled', 
+ correlationId: 'sleep-corr-1', + data: {}, + }, + { + eventId: 'e2', + type: 'other_event', + correlationId: 'other-id', + data: {}, + }, ]; vi.mocked(fetchEvents).mockResolvedValue({ success: true, diff --git a/packages/web/app/server/workflow-server-actions.server.ts b/packages/web/app/server/workflow-server-actions.server.ts index c5127a19a8..ca072d1b77 100644 --- a/packages/web/app/server/workflow-server-actions.server.ts +++ b/packages/web/app/server/workflow-server-actions.server.ts @@ -447,7 +447,7 @@ async function getWorldFromEnv(userEnvMap: EnvMap): Promise { return cachedWorld; } - const world = createWorld(); + const world = await createWorld(); worldCache.set(cacheKey, world); return world; } diff --git a/packages/workflow/src/observability.ts b/packages/workflow/src/observability.ts index 93a964f76c..45e90ad676 100644 --- a/packages/workflow/src/observability.ts +++ b/packages/workflow/src/observability.ts @@ -9,7 +9,7 @@ * import { getWorld } from 'workflow/api'; * import { hydrateResourceIO, observabilityRevivers } from 'workflow/observability'; * - * const world = getWorld(); + * const world = await getWorld(); * const step = await world.steps.get(runId, stepId, { resolveData: 'all' }); * const hydrated = hydrateResourceIO(step, observabilityRevivers); * // hydrated.input and hydrated.output are now plain JS objects diff --git a/packages/world-local/src/streamer.test.ts b/packages/world-local/src/streamer.test.ts index 72931c84c7..53ec843705 100644 --- a/packages/world-local/src/streamer.test.ts +++ b/packages/world-local/src/streamer.test.ts @@ -795,9 +795,13 @@ describe('streamer', () => { await streamer.streams.close(TEST_RUN_ID, streamName); // Page 1: limit=2 - const page1 = await streamer.streams.getChunks(TEST_RUN_ID, streamName, { - limit: 2, - }); + const page1 = await streamer.streams.getChunks( + TEST_RUN_ID, + streamName, + { + limit: 2, + } + ); expect(page1.data).toHaveLength(2); expect(page1.data[0].index).toBe(0); 
expect(page1.data[1].index).toBe(1); @@ -805,10 +809,14 @@ describe('streamer', () => { expect(page1.cursor).not.toBeNull(); // Page 2: remaining chunks - const page2 = await streamer.streams.getChunks(TEST_RUN_ID, streamName, { - limit: 2, - cursor: page1.cursor!, - }); + const page2 = await streamer.streams.getChunks( + TEST_RUN_ID, + streamName, + { + limit: 2, + cursor: page1.cursor!, + } + ); expect(page2.data).toHaveLength(1); expect(page2.data[0].index).toBe(2); expect(page2.hasMore).toBe(false); @@ -821,7 +829,10 @@ describe('streamer', () => { await streamer.streams.write(TEST_RUN_ID, streamName, 'data'); - const result = await streamer.streams.getChunks(TEST_RUN_ID, streamName); + const result = await streamer.streams.getChunks( + TEST_RUN_ID, + streamName + ); expect(result.data).toHaveLength(1); expect(result.done).toBe(false); }); @@ -845,9 +856,13 @@ describe('streamer', () => { await streamer.streams.close(TEST_RUN_ID, streamName); // Invalid cursor should reset to beginning - const result = await streamer.streams.getChunks(TEST_RUN_ID, streamName, { - cursor: 'not-valid-base64-json', - }); + const result = await streamer.streams.getChunks( + TEST_RUN_ID, + streamName, + { + cursor: 'not-valid-base64-json', + } + ); expect(result.data).toHaveLength(1); expect(result.data[0].index).toBe(0); }); diff --git a/packages/world-local/src/streamer.ts b/packages/world-local/src/streamer.ts index 4b2c2ce6bf..b6d3d973a7 100644 --- a/packages/world-local/src/streamer.ts +++ b/packages/world-local/src/streamer.ts @@ -375,10 +375,7 @@ export function createStreamer(basedir: string, tag?: string): Streamer { }; }, - async getInfo( - _runId: string, - name: string - ): Promise { + async getInfo(_runId: string, name: string): Promise { const chunksDir = path.join(basedir, 'streams', 'chunks'); const { files: chunkFiles, extMap: fileExtMap } = await listChunkFilesForStream(chunksDir, name, tag); diff --git a/packages/world-postgres/HOW_IT_WORKS.md 
b/packages/world-postgres/HOW_IT_WORKS.md index a7ad4456da..4c13a2a20e 100644 --- a/packages/world-postgres/HOW_IT_WORKS.md +++ b/packages/world-postgres/HOW_IT_WORKS.md @@ -48,7 +48,8 @@ In **Next.js**, the `world.start()` call needs to be added to `instrumentation.t if (process.env.NEXT_RUNTIME !== "edge") { import("workflow/runtime").then(async ({ getWorld }) => { // start listening to the jobs. - await getWorld().start?.(); + const world = await getWorld(); + await world.start?.(); }); } ``` diff --git a/packages/world-postgres/src/streamer.ts b/packages/world-postgres/src/streamer.ts index 9986e5f684..a79c3ebd75 100644 --- a/packages/world-postgres/src/streamer.ts +++ b/packages/world-postgres/src/streamer.ts @@ -325,10 +325,7 @@ export function createStreamer(pool: Pool, drizzle: Drizzle): PostgresStreamer { }; }, - async getInfo( - _runId: string, - name: string - ): Promise { + async getInfo(_runId: string, name: string): Promise { // Use COUNT(*) instead of fetching all rows into memory const [countResult] = await drizzle .select({ count: sql`count(*)` }) diff --git a/packages/world-testing/src/server.mts b/packages/world-testing/src/server.mts index 923b2f80f6..229f96f626 100644 --- a/packages/world-testing/src/server.mts +++ b/packages/world-testing/src/server.mts @@ -65,7 +65,8 @@ const app = new Hono() return ctx.json({ runId, hookId: hook.hookId }); }) .get('/runs/:runId', async (ctx) => { - const run = await getWorld().runs.get(ctx.req.param('runId')); + const world = await getWorld(); + const run = await world.runs.get(ctx.req.param('runId')); // Custom JSON serialization to handle Uint8Array as base64 const json = JSON.stringify(run, (_key, value) => { if (value instanceof Uint8Array) { @@ -112,7 +113,7 @@ serve( } } - const world = getWorld(); + const world = await getWorld(); if (world.start) { console.log(`starting background tasks...`); await world.start().then( diff --git a/packages/world-vercel/src/streamer.ts 
b/packages/world-vercel/src/streamer.ts index 48a490568d..93b678b67c 100644 --- a/packages/world-vercel/src/streamer.ts +++ b/packages/world-vercel/src/streamer.ts @@ -227,10 +227,7 @@ export function createStreamer(config?: APIConfig): Streamer { }); }, - async getInfo( - runId: string, - name: string - ): Promise { + async getInfo(runId: string, name: string): Promise { const endpoint = `/v2/runs/${encodeURIComponent(runId)}/streams/${encodeURIComponent(name)}/info`; return makeRequest({ endpoint, diff --git a/skills/workflow/SKILL.md b/skills/workflow/SKILL.md index 15ec0611ad..54d62f314c 100644 --- a/skills/workflow/SKILL.md +++ b/skills/workflow/SKILL.md @@ -3,7 +3,7 @@ name: workflow description: Creates durable, resumable workflows using Vercel's Workflow SDK. Use when building workflows that need to survive restarts, pause for external events, retry on failure, or coordinate multi-step operations over time. Triggers on mentions of "workflow", "durable functions", "resumable", "workflow sdk", "queue", "event", "push", "subscribe", or step-based orchestration. metadata: author: Vercel Inc. - version: '1.6' + version: '1.7' --- ## *CRITICAL*: Always Use Correct `workflow` Documentation @@ -617,7 +617,7 @@ await resumeWebhook(hook.token, new Request("https://example.com/webhook", { ## Observability & World SDK -Use `getWorld()` to build observability dashboards, admin panels, and inspect workflow state. +Use `await getWorld()` to build observability dashboards, admin panels, and inspect workflow state. `getWorld()` is asynchronous and returns `Promise` (dynamic import / env-based setup). **Key imports:** ```typescript @@ -634,7 +634,7 @@ import { hydrateResourceIO, observabilityRevivers, parseStepName, parseWorkflowN ⚠️ Pagination is nested: `{ pagination: { cursor } }` — NOT `{ cursor }` directly. 
```typescript -const world = getWorld(); +const world = await getWorld(); // Runs const { data, cursor } = await world.runs.list({ pagination: { cursor }, resolveData: 'all' | 'none' }); @@ -654,12 +654,14 @@ await world.events.create(runId, { eventType: 'run_cancelled' }); const hook = await world.hooks.get(hookId); const hook = await world.hooks.getByToken(token); -// Streams (methods live directly on world, not nested) -await world.writeToStream(name, runId, chunk); -const readable = await world.readFromStream(name); -const chunks = await world.getStreamChunks(name, runId, { limit, cursor }); -const info = await world.getStreamInfo(name, runId); -const streams = await world.listStreamsByRunId(runId); +// Streams (methods on world.streams) +await world.streams.write(runId, name, chunk); +await world.streams.writeMulti?.(runId, name, chunks); +const readable = await world.streams.get(runId, name, startIndex); +await world.streams.close(runId, name); +const streamNames = await world.streams.list(runId); +const chunks = await world.streams.getChunks(runId, name, { limit, cursor }); +const info = await world.streams.getInfo(runId, name); // Queue (methods live directly on world — internal SDK infrastructure) await world.queue(queueName, payload, opts); diff --git a/workbench/astro/scripts/start-with-pg.mjs b/workbench/astro/scripts/start-with-pg.mjs index c6c5060906..def1d9f795 100644 --- a/workbench/astro/scripts/start-with-pg.mjs +++ b/workbench/astro/scripts/start-with-pg.mjs @@ -8,7 +8,11 @@ async function main() { if (process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres') { console.log('Starting Postgres World...'); const { getWorld } = await import('workflow/runtime'); - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } } // Now start the Astro server diff --git a/workbench/nest/src/main.ts b/workbench/nest/src/main.ts index 100c8774eb..b0c22548c6 
100644 --- a/workbench/nest/src/main.ts +++ b/workbench/nest/src/main.ts @@ -7,8 +7,11 @@ async function bootstrap() { // Start the Postgres World if configured if (process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres') { const { getWorld } = await import('workflow/runtime'); - console.log('Starting Postgres World...'); - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } } const app = await NestFactory.create(AppModule, { diff --git a/workbench/nextjs-turbopack/instrumentation.ts b/workbench/nextjs-turbopack/instrumentation.ts index 174137a971..508eddd8d3 100644 --- a/workbench/nextjs-turbopack/instrumentation.ts +++ b/workbench/nextjs-turbopack/instrumentation.ts @@ -5,6 +5,10 @@ registerOTel({ serviceName: 'example-nextjs-workflow' }); if (process.env.NEXT_RUNTIME !== 'edge') { // kickstart the world import('workflow/runtime').then(async ({ getWorld }) => { - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } }); } diff --git a/workbench/nitro-v3/plugins/start-pg-world.ts b/workbench/nitro-v3/plugins/start-pg-world.ts index 7e9cff2247..43f690fdad 100644 --- a/workbench/nitro-v3/plugins/start-pg-world.ts +++ b/workbench/nitro-v3/plugins/start-pg-world.ts @@ -5,8 +5,11 @@ import { defineNitroPlugin } from 'nitro/~internal/runtime/plugin'; export default defineNitroPlugin(async () => { if (process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres') { import('workflow/runtime').then(async ({ getWorld }) => { - console.log('Starting Postgres World...'); - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } }); } }); diff --git a/workbench/nuxt/server/plugins/start-pg-world.ts b/workbench/nuxt/server/plugins/start-pg-world.ts index 
2824d2b3ec..613d8e3110 100644 --- a/workbench/nuxt/server/plugins/start-pg-world.ts +++ b/workbench/nuxt/server/plugins/start-pg-world.ts @@ -5,8 +5,11 @@ import { defineNitroPlugin } from '#imports'; export default defineNitroPlugin(async () => { if (process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres') { import('workflow/runtime').then(async ({ getWorld }) => { - console.log('Starting Postgres World...'); - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } }); } }); diff --git a/workbench/sveltekit/src/hooks.server.ts b/workbench/sveltekit/src/hooks.server.ts index 16d598cf0b..619b303f8b 100644 --- a/workbench/sveltekit/src/hooks.server.ts +++ b/workbench/sveltekit/src/hooks.server.ts @@ -5,7 +5,10 @@ export const init: ServerInit = async () => { // Needed since we test this in CI if (process.env.WORKFLOW_TARGET_WORLD === '@workflow/world-postgres') { const { getWorld } = await import('workflow/runtime'); - console.log('Starting Postgres World...'); - await getWorld().start?.(); + const world = await getWorld(); + if (world.start) { + console.log('Starting World workers...'); + await world.start(); + } } };