From 28b55c2ccf986315cbd6cbd73336bde5c45c73c1 Mon Sep 17 00:00:00 2001 From: kjgbot Date: Sat, 9 May 2026 19:10:03 +0200 Subject: [PATCH] fix(local-auto-fix): drop synthetic stage ids from startFromStep retry MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the local-runtime launch/precheck fails before any Agent Relay SDK step starts, Ricky tags evidence with a synthetic stage id ('runtime-launch', 'runtime-precheck', or the 'local-runtime' fallback). The auto-fix loop was reading those back out and forwarding them to the SDK as `startFromStep` on every retry, so the SDK threw `startFrom step "..." not found in workflow` seven times in a row before giving up — the user saw a misleading `MISSING_BINARY at runtime-launch` final blocker instead of the real classified failure. Filter the synthetic ids out at the boundary in `failedStepFromEvidence` so launch-phase failures retry from the start (with `previousRunId` still reused, matching the cli-auto-fix-and-resume spec). Centralize the allowlist in `src/local/synthetic-step-ids.ts` so the next stage label added in `entrypoint.ts` has a single source of truth to register against. Co-Authored-By: Claude Opus 4.7 (1M context) --- specs/auto-fix-synthetic-step-ids.md | 104 +++++++++++++++++++++++++++ src/local/auto-fix-loop.test.ts | 63 ++++++++++++++-- src/local/auto-fix-loop.ts | 6 +- src/local/synthetic-step-ids.ts | 13 ++++ 4 files changed, 178 insertions(+), 8 deletions(-) create mode 100644 specs/auto-fix-synthetic-step-ids.md create mode 100644 src/local/synthetic-step-ids.ts diff --git a/specs/auto-fix-synthetic-step-ids.md b/specs/auto-fix-synthetic-step-ids.md new file mode 100644 index 00000000..e02c03e0 --- /dev/null +++ b/specs/auto-fix-synthetic-step-ids.md @@ -0,0 +1,104 @@ +# Spec: auto-fix loop must not forward synthetic stage IDs as `startFromStep` + +## Problem + +When `ricky --mode local --spec-file --run` (or any `runLocal` invocation that lands in the auto-fix loop) hits a launch-phase failure — i.e. the spawned workflow process exits non-zero before any Agent Relay SDK step starts — Ricky reports the failure with a synthetic step id and the auto-fix loop then forwards that id back to the SDK as `startFromStep` on every retry. The SDK rejects it because the id is not a real workflow step name, and all retries fail identically. + +Concrete repro from a real run on `cloud@codex/msd-shared-sandbox-review-runtime-spec`: + +``` +ricky --mode local --spec-file docs/runtimes/specs/msd-shared-sandbox-review-runtime.md --run +… +Error: startFrom step "local-runtime" not found in workflow. Available steps: prepare-context, …, final-signoff +… +Error: startFrom step "runtime-launch" not found in workflow. Available steps: prepare-context, …, final-signoff # ×6 more +… +Execution: blocked — MISSING_BINARY at runtime-launch +Auto-fix: stopped after 7/7 attempt(s) (MISSING_BINARY) +``` + +The synthetic ids come from three places inside Ricky: + +- `src/local/entrypoint.ts:2013` — `runtime-precheck` ("Local runtime precheck"), reported when the precheck phase fails. +- `src/local/entrypoint.ts:2085` — `runtime-launch` ("Local runtime execution"), reported when the launch phase fails. +- `src/local/auto-fix-loop.ts:1643` — `local-runtime` fallback in `localResponseToWorkflowRunEvidence` when no `failed_step.id` is reported. + +All three are Ricky-internal labels for stage-level failures, not real Agent Relay SDK step names. + +The auto-fix loop then propagates that id to the SDK: + +- `src/local/auto-fix-loop.ts:148` — `failedStepFromEvidence` reads back the synthetic id as `failedStep`. +- `src/local/auto-fix-loop.ts:362, 387, 475` — passes `startFromStep: failedStep` into the retry request. +- `src/local/entrypoint.ts:401, 532, 798` — that becomes a `--start-from ` CLI arg or a `START_FROM` env var on the next workflow run. +- `@agent-relay/sdk WorkflowRunner.execute` validates `startFrom` against the workflow's actual step names and throws `startFrom step "" not found in workflow`. The seven retries all fail the same way; the user sees seven copies of the same stack trace and a misleading `MISSING_BINARY at runtime-launch` final blocker. + +## Behavior we want + +A launch-phase failure (process never reached an SDK step) must not produce a `startFromStep` on retry. The retry should re-run from the beginning — with `previousRunId` reused if available — exactly as the spec already says ("If no step granularity is reported … `--start-from` is omitted and we just retry the whole run with `--previous-run-id`." — `specs/cli-auto-fix-and-resume.md:59`). + +After this fix, the same repro must: + +- Not emit any `startFrom step "..." not found in workflow` errors on retry. +- Either succeed within the budget if the underlying failure is repairable, or terminate with the real classified blocker (e.g. `MISSING_BINARY` for an actually-missing binary) without burning retries on a phantom-step error. +- Surface the real failed step in the per-attempt summary only when one exists; otherwise omit `failed_step` instead of synthesizing `runtime-launch` / `local-runtime`. + +## Implementation surface + +The fix is contained and surgical. Two coordinated changes: + +1. **Stop synthesizing fake step ids as resume targets.** Introduce a single source of truth listing the synthetic stage labels Ricky uses internally: + + ```ts + // src/local/synthetic-step-ids.ts + export const SYNTHETIC_LOCAL_STAGE_IDS = new Set([ + 'runtime-precheck', + 'runtime-launch', + 'local-runtime', + ]); + export function isSyntheticStageId(id: string | undefined): boolean { … } + ``` + + The label may still appear in human-facing evidence (`evidence.failed_step.name = 'Local runtime execution'`) and in attempt summary output for diagnostics, but it must not be forwarded to the SDK as a `startFrom` target. + +2. **Filter at the boundary, not at the source.** The cleanest place is `failedStepFromEvidence` in `src/local/auto-fix-loop.ts`: return `undefined` when the only failed step id is synthetic. That keeps the existing evidence/event shape intact (other consumers, tests, and the persona-repair prompt may still want to know "the launch phase failed") while making the auto-fix retry path correctly treat it as a no-step-granularity failure. + + Concretely: + + ```ts + // src/local/auto-fix-loop.ts + function failedStepFromEvidence(evidence: WorkflowRunEvidence): string | undefined { + const real = evidence.steps.find( + (step) => step.status === 'failed' && !isSyntheticStageId(step.stepId), + ); + return real?.stepId; + } + ``` + + Every existing call site that reads `failedStep` (`src/local/auto-fix-loop.ts:148, 362, 387, 475` and the attempt-summary builder at `:155`) keeps working unchanged — they already guard with `failedStep ? { startFromStep: failedStep } : {}`, and after this filter `failedStep` will be `undefined` for launch-phase failures. + +3. **Keep the synthetic id out of evidence.steps[].stepId for resume purposes only.** Leave `src/local/entrypoint.ts:2104` (`workflow_steps[].id = 'runtime-launch'` in the success branch) untouched — that is observability metadata for a successful run and does not flow into auto-fix. The fallback at `src/local/auto-fix-loop.ts:1643` (`stepId: failedStepId ?? 'local-runtime'`) is the one that produces the bogus retry target; it should keep using the synthetic id for the *fallback step name* so output remains readable, but `failedStepFromEvidence` is the only place that decides resume targets, so this is already covered by change 2. + +## Test cases + +Unit tests in `src/local/auto-fix-loop.test.ts` (extend the existing file): + +1. **Synthetic launch-phase failure does not propagate `startFromStep`** — first attempt returns a `LocalResponse` with `execution.evidence.failed_step = { id: 'runtime-launch', name: 'Local runtime execution' }`. The retry request built by the loop must have `retry.startFromStep === undefined`. The retry must still set `retry.previousRunId` when a run id is available. +2. **Synthetic fallback id does not propagate either** — `LocalResponse` with no `failed_step` field; the fallback path stamps `stepId: 'local-runtime'` into evidence. Same assertion as (1). +3. **Real failed step still propagates** — evidence has a real failed step (`stepId: 'aggregate-drift'`, `status: 'failed'`). Retry request must carry `startFromStep: 'aggregate-drift'`. (Regression guard for the existing happy path.) +4. **Mixed evidence prefers a real failed step over a synthetic one** — if `evidence.steps` contains both `{ stepId: 'runtime-launch', status: 'failed' }` and `{ stepId: 'aggregate-drift', status: 'failed' }`, the loop forwards `aggregate-drift`. (Defends against future changes that emit both.) +5. **Attempt summary still records the synthetic name** — so users / logs can still see "Local runtime execution failed" even though no `startFromStep` was forwarded. The `failed_step` field on the summary is allowed to be omitted, but the human-readable summary line / evidence narrative must still mention the launch-phase failure. + +End-to-end (manual): rerun the original repro (`ricky --mode local --spec-file --run` against a workflow that fails to launch — e.g. arrange for a `MISSING_BINARY`). Verify zero `startFrom step "..." not found in workflow` errors across all retries; verify the final summary cites the real underlying blocker. + +## Out of scope + +- Changing the `MISSING_BINARY` classification or the auto-fix attempt budget. +- Renaming or removing the synthetic stage labels in observability output (`evidence.failed_step.name`, attempt summary text). They remain useful for humans reading logs. +- Detecting the *real* failed step from stderr when the SDK never reported one. The retry just re-runs from the start, which is correct and safe. +- Cloud-run retry semantics. This is local-only. + +## Acceptance + +- The original repro (`ricky --mode local --spec-file docs/runtimes/specs/msd-shared-sandbox-review-runtime.md --run`, run from the `cloud` repo) emits no `startFrom step "..." not found in workflow` errors. The final blocker, if any, reflects the real classified failure (e.g. `MISSING_BINARY` with the genuinely missing binary), not `runtime-launch`. +- `npm test` passes, including the new auto-fix-loop test cases above. +- No regression in `specs/cli-auto-fix-and-resume.md` acceptance criteria — `--auto-fix` with `MISSING_BINARY` + `npm install` still resumes correctly when a real step fails. diff --git a/src/local/auto-fix-loop.test.ts b/src/local/auto-fix-loop.test.ts index aa52dd78..deddf2a4 100644 --- a/src/local/auto-fix-loop.test.ts +++ b/src/local/auto-fix-loop.test.ts @@ -93,8 +93,8 @@ describe('runWithAutoFix', () => { it('passes failed repair history into the next workflow repair attempt', async () => { const runSingleAttempt = vi .fn() - .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-1', 'runtime-launch')) - .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-2', 'runtime-launch')) + .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-1', 'install-deps')) + .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-2', 'install-deps')) .mockResolvedValueOnce(successResponse('run-3')); const workflowRepairer = vi .fn() @@ -123,7 +123,7 @@ describe('runWithAutoFix', () => { retryAttempt: 2, outcome: expect.objectContaining({ status: 'blocker', - failedStep: 'runtime-launch', + failedStep: 'install-deps', blockerCode: 'MISSING_ENV_VAR', runId: 'run-2', debuggerSummary: 'Set TEST_TOKEN before retrying.', @@ -521,7 +521,7 @@ describe('runWithAutoFix', () => { it('uses the persona repair path even when the debugger recommends guided repair', async () => { const runSingleAttempt = vi .fn() - .mockResolvedValueOnce(blockerResponse('INVALID_ARTIFACT', 'run-1', 'runtime-launch')) + .mockResolvedValueOnce(blockerResponse('INVALID_ARTIFACT', 'run-1', 'install-deps')) .mockResolvedValueOnce(successResponse('run-2')); const workflowRepairer = vi.fn().mockResolvedValue(workflowRepair('guided repair workflow')); @@ -542,7 +542,7 @@ describe('runWithAutoFix', () => { it('repairs missing environment prerequisites in the workflow artifact before retrying', async () => { const runSingleAttempt = vi .fn() - .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-1', 'runtime-launch')) + .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-1', 'install-deps')) .mockResolvedValueOnce(successResponse('run-2')); const workflowRepairer = vi.fn().mockResolvedValue(workflowRepair('repaired env workflow')); @@ -557,7 +557,7 @@ describe('runWithAutoFix', () => { expect(runSingleAttempt).toHaveBeenCalledTimes(2); expect(workflowRepairer).toHaveBeenCalledWith(expect.objectContaining({ - failedStep: 'runtime-launch', + failedStep: 'install-deps', runId: 'run-1', response: expect.objectContaining({ execution: expect.objectContaining({ @@ -568,7 +568,7 @@ describe('runWithAutoFix', () => { expect(runSingleAttempt.mock.calls[1][0].retry).toMatchObject({ previousRunId: 'run-1', retryOfRunId: 'run-1', - startFromStep: 'runtime-launch', + startFromStep: 'install-deps', }); expect(result.ok).toBe(true); expect(result.auto_fix).toMatchObject({ @@ -718,6 +718,55 @@ describe('runWithAutoFix', () => { expect(runSingleAttempt.mock.calls[1][0].retry.previousRunId).toBeUndefined(); expect(result.warnings).toContain('Auto-fix retry could not resolve a previous run id; retrying without step-level resume.'); }); + + it.each([ + ['runtime-launch'], + ['local-runtime'], + ['runtime-precheck'], + ])('does not forward synthetic stage id %s as startFromStep on retry', async (syntheticId) => { + const runSingleAttempt = vi + .fn() + .mockResolvedValueOnce(blockerResponse('MISSING_ENV_VAR', 'run-1', syntheticId)) + .mockResolvedValueOnce(successResponse('run-2')); + const workflowRepairer = vi.fn().mockResolvedValue(workflowRepair('repaired workflow')); + + const result = await runWithAutoFix(baseRequest, { + maxAttempts: 3, + runSingleAttempt, + classifyFailure: fakeClassification, + debugWorkflowRun: directDebugger, + workflowRepairer, + artifactWriter: vi.fn().mockResolvedValue(undefined), + }); + + expect(result.ok).toBe(true); + expect(runSingleAttempt).toHaveBeenCalledTimes(2); + expect(runSingleAttempt.mock.calls[1][0].retry).toMatchObject({ + attempt: 2, + previousRunId: 'run-1', + }); + expect(runSingleAttempt.mock.calls[1][0].retry.startFromStep).toBeUndefined(); + expect(workflowRepairer.mock.calls[0][0].failedStep).toBeUndefined(); + }); + +}); + +describe('isSyntheticStageId', () => { + it('matches the runtime-launch / local-runtime / runtime-precheck labels', async () => { + const { isSyntheticStageId, SYNTHETIC_LOCAL_STAGE_IDS } = await import('./synthetic-step-ids.js'); + expect(isSyntheticStageId('runtime-launch')).toBe(true); + expect(isSyntheticStageId('local-runtime')).toBe(true); + expect(isSyntheticStageId('runtime-precheck')).toBe(true); + expect(SYNTHETIC_LOCAL_STAGE_IDS.size).toBe(3); + }); + + it('does not match real workflow step ids', async () => { + const { isSyntheticStageId } = await import('./synthetic-step-ids.js'); + expect(isSyntheticStageId('install-deps')).toBe(false); + expect(isSyntheticStageId('aggregate-drift')).toBe(false); + expect(isSyntheticStageId(undefined)).toBe(false); + expect(isSyntheticStageId('')).toBe(false); + }); }); function successResponse(runId: string): LocalResponse { diff --git a/src/local/auto-fix-loop.ts b/src/local/auto-fix-loop.ts index f805867e..6c124f44 100644 --- a/src/local/auto-fix-loop.ts +++ b/src/local/auto-fix-loop.ts @@ -11,6 +11,7 @@ import type { FailureClassification } from '../runtime/failure/types.js'; import { debugWorkflowRun as defaultDebugWorkflowRun } from '../product/specialists/debugger/debugger.js'; import type { DebuggerResult } from '../product/specialists/debugger/types.js'; import type { WorkflowRunEvidence, WorkflowStepEvidence } from '../shared/models/workflow-evidence.js'; +import { isSyntheticStageId } from './synthetic-step-ids.js'; import { repairWorkflowWithWorkforcePersona } from '../product/generation/workforce-persona-repairer.js'; import type { WorkforcePersonaRepairAttempt } from '../product/generation/workforce-persona-repairer.js'; import { @@ -1628,7 +1629,10 @@ function resolveRunId(response: LocalResponse): string | undefined { } function failedStepFromEvidence(evidence: WorkflowRunEvidence): string | undefined { - return evidence.steps.find((step) => step.status === 'failed')?.stepId; + const real = evidence.steps.find( + (step) => step.status === 'failed' && !isSyntheticStageId(step.stepId), + ); + return real?.stepId; } function localResponseToWorkflowRunEvidence(response: LocalResponse, attempt: number): WorkflowRunEvidence { diff --git a/src/local/synthetic-step-ids.ts b/src/local/synthetic-step-ids.ts new file mode 100644 index 00000000..d9f2eef0 --- /dev/null +++ b/src/local/synthetic-step-ids.ts @@ -0,0 +1,13 @@ +// Synthetic stage IDs Ricky uses to label local-runtime phases that aren't real +// Agent Relay SDK workflow steps. Forwarding them to the SDK as a `startFrom` +// resume target produces `startFrom step "..." not found in workflow`, so the +// auto-fix loop must filter them out. +export const SYNTHETIC_LOCAL_STAGE_IDS = new Set([ + 'runtime-precheck', + 'runtime-launch', + 'local-runtime', +]); + +export function isSyntheticStageId(id: string | undefined): boolean { + return !!id && SYNTHETIC_LOCAL_STAGE_IDS.has(id); +}