From 2f0c0103541fcb32e1d63b6ed556e18577953303 Mon Sep 17 00:00:00 2001 From: Khaliq Date: Sun, 14 Jun 2026 14:06:28 +0200 Subject: [PATCH] Stop the resume melt-down on leaked broker agent names MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Recurring "[factory] error HarnessDriverProtocolError: agent '' already exists" (http 500): an agent exits, #handleAgentExit resumes it via spawnPty with the same name, but the broker never released that name on exit (relay#1116-family) so re-registering collides. The error is marked retryable:true, so every exit event re-collides — a 500 loop (seen on ar-255-impl then ar-255-review). Handle it gracefully: on resume, detect the "already exists" collision (isAgentAlreadyExistsError), record the resume key so subsequent exit events short-circuit, increment resumeNameCollisions, and warn once — instead of surfacing a hard error and retrying forever. The external reaper / a broker restart reclaims the leaked name; the real upstream fix is relay#1116 (agents self-exit so no resume is needed). Test covers: a resume collision is resumed once, not retried on a second exit event, counted, and not surfaced as a hard error. Full factory-sdk suite green (346), typecheck clean. Co-Authored-By: Claude Opus 4.8 (1M context) --- .../src/orchestrator/factory.test.ts | 34 +++++++++++++++++++ .../factory-sdk/src/orchestrator/factory.ts | 29 ++++++++++++++++ 2 files changed, 63 insertions(+) diff --git a/packages/factory-sdk/src/orchestrator/factory.test.ts b/packages/factory-sdk/src/orchestrator/factory.test.ts index 9ee41738..08ddcaa5 100644 --- a/packages/factory-sdk/src/orchestrator/factory.test.ts +++ b/packages/factory-sdk/src/orchestrator/factory.test.ts @@ -204,6 +204,21 @@ class SpawnFailingFleetClient extends FakeFleetClient { } } +// Mimics the broker rejecting a resume because it never released the agent's +// name on exit (relay#1116-family): http 500 "agent '' already exists". +class ResumeNameCollisionFleetClient extends FakeFleetClient { + override async resume(input: Parameters[0]): Promise { + this.resumes.push(input) + const name = input.name ?? input.sessionRef + throw Object.assign(new Error(`agent '${name}' already exists`), { + code: 'http_500', + status: 500, + retryable: true, + data: { error: `agent '${name}' already exists`, name, success: false }, + }) + } +} + class ManualClock { value = 0 @@ -3092,6 +3107,25 @@ describe('FactoryLoop', () => { expect(factory.status().inFlight).toEqual([]) }) + it('does not loop when resume hits a leaked broker name ("already exists")', async () => { + const mount = new FakeMountClient({ [issuePath(80)]: issueFile(80) }) + const fleet = new ResumeNameCollisionFleetClient() + fleet.setSessionRef('ar-80-review', 'session-review-80') + const factory = createFactory(config(), { mount, fleet, triage: new StaticTriage() }) + const decision = await factory.triageIssue(parseLinearIssue(issuePath(80), issueFile(80))) + + await factory.dispatch(decision) + fleet.emitAgentExit('ar-80-review', 'crash') + await flush() + // A second exit event must NOT trigger another resume attempt. + fleet.emitAgentExit('ar-80-review', 'crash') + await flush() + + expect(fleet.resumes).toHaveLength(1) // resumed once, collided, then short-circuited + expect(factory.status().counters.resumeNameCollisions).toBe(1) + expect(factory.status().counters.errors ?? 0).toBe(0) // not surfaced as a hard error + }) + it('does not complete on an implementer exit when only a draft PR exists', async () => { const issue = realIssueFile(256, ready, { title: 'Real implementer draft PR exit' }) const mount = new FakeMountClient({ [issuePath(256)]: issue }) diff --git a/packages/factory-sdk/src/orchestrator/factory.ts b/packages/factory-sdk/src/orchestrator/factory.ts index 2f1e60af..417b67df 100644 --- a/packages/factory-sdk/src/orchestrator/factory.ts +++ b/packages/factory-sdk/src/orchestrator/factory.ts @@ -1603,6 +1603,24 @@ export class FactoryLoop implements Factory { try { await resume this.#resumedExitKeys.add(resumeKey) + } catch (error) { + if (isAgentAlreadyExistsError(error)) { + // The broker never released this agent's name on exit + // (relay#1116-family), so re-registering collides with the stuck + // name. The error is marked retryable but isn't — retrying just + // re-collides forever. Treat it as terminal for this name: record + // the resume key so subsequent exit events short-circuit, count it, + // and warn once instead of spamming a 500 stack trace. The external + // reaper / a broker restart reclaims the leaked name. + this.#resumedExitKeys.add(resumeKey) + this.#increment('resumeNameCollisions') + this.#logger.warn?.('[factory] resume skipped: broker still holds agent name (relay#1116); not retrying', { + issue: record.issue.key, + name, + }) + } else { + throw error + } } finally { this.#resumeInFlight.delete(resumeKey) } @@ -2912,6 +2930,17 @@ const labelName = (value: unknown): string | undefined => { const isCompletionReason = (reason?: string): boolean => reason === 'issue-done' || reason === 'done' || reason === 'completed' +// The broker rejects re-registering a name it never released on exit +// (relay#1116-family) with a 500 "agent '' already exists". Detect it from +// the structured payload or the message so resume can treat it as terminal +// rather than retrying the (falsely) "retryable" error forever. +const isAgentAlreadyExistsError = (error: unknown): boolean => { + const record = asRecord(error) + const data = asRecord(record?.data) + const message = stringValue(data?.error) ?? (error instanceof Error ? error.message : '') + return /already exists/iu.test(message) +} + const defaultRestartPolicy = (spec: AgentSpec): AgentSpec['restartPolicy'] | undefined => spec.role === 'implementer' ? { maxRestarts: 3, strategy: 'resume' } as AgentSpec['restartPolicy'] : spec.restartPolicy