diff --git a/packages/factory-sdk/src/orchestrator/factory.test.ts b/packages/factory-sdk/src/orchestrator/factory.test.ts index 9ee41738..08ddcaa5 100644 --- a/packages/factory-sdk/src/orchestrator/factory.test.ts +++ b/packages/factory-sdk/src/orchestrator/factory.test.ts @@ -204,6 +204,21 @@ class SpawnFailingFleetClient extends FakeFleetClient { } } +// Mimics the broker rejecting a resume because it never released the agent's +// name on exit (relay#1116-family): http 500 "agent '' already exists". +class ResumeNameCollisionFleetClient extends FakeFleetClient { + override async resume(input: Parameters[0]): Promise { + this.resumes.push(input) + const name = input.name ?? input.sessionRef + throw Object.assign(new Error(`agent '${name}' already exists`), { + code: 'http_500', + status: 500, + retryable: true, + data: { error: `agent '${name}' already exists`, name, success: false }, + }) + } +} + class ManualClock { value = 0 @@ -3092,6 +3107,25 @@ describe('FactoryLoop', () => { expect(factory.status().inFlight).toEqual([]) }) + it('does not loop when resume hits a leaked broker name ("already exists")', async () => { + const mount = new FakeMountClient({ [issuePath(80)]: issueFile(80) }) + const fleet = new ResumeNameCollisionFleetClient() + fleet.setSessionRef('ar-80-review', 'session-review-80') + const factory = createFactory(config(), { mount, fleet, triage: new StaticTriage() }) + const decision = await factory.triageIssue(parseLinearIssue(issuePath(80), issueFile(80))) + + await factory.dispatch(decision) + fleet.emitAgentExit('ar-80-review', 'crash') + await flush() + // A second exit event must NOT trigger another resume attempt. + fleet.emitAgentExit('ar-80-review', 'crash') + await flush() + + expect(fleet.resumes).toHaveLength(1) // resumed once, collided, then short-circuited + expect(factory.status().counters.resumeNameCollisions).toBe(1) + expect(factory.status().counters.errors ?? 0).toBe(0) // not surfaced as a hard error + }) + it('does not complete on an implementer exit when only a draft PR exists', async () => { const issue = realIssueFile(256, ready, { title: 'Real implementer draft PR exit' }) const mount = new FakeMountClient({ [issuePath(256)]: issue }) diff --git a/packages/factory-sdk/src/orchestrator/factory.ts b/packages/factory-sdk/src/orchestrator/factory.ts index 2f1e60af..417b67df 100644 --- a/packages/factory-sdk/src/orchestrator/factory.ts +++ b/packages/factory-sdk/src/orchestrator/factory.ts @@ -1603,6 +1603,24 @@ export class FactoryLoop implements Factory { try { await resume this.#resumedExitKeys.add(resumeKey) + } catch (error) { + if (isAgentAlreadyExistsError(error)) { + // The broker never released this agent's name on exit + // (relay#1116-family), so re-registering collides with the stuck + // name. The error is marked retryable but isn't — retrying just + // re-collides forever. Treat it as terminal for this name: record + // the resume key so subsequent exit events short-circuit, count it, + // and warn once instead of spamming a 500 stack trace. The external + // reaper / a broker restart reclaims the leaked name. + this.#resumedExitKeys.add(resumeKey) + this.#increment('resumeNameCollisions') + this.#logger.warn?.('[factory] resume skipped: broker still holds agent name (relay#1116); not retrying', { + issue: record.issue.key, + name, + }) + } else { + throw error + } } finally { this.#resumeInFlight.delete(resumeKey) } @@ -2912,6 +2930,17 @@ const labelName = (value: unknown): string | undefined => { const isCompletionReason = (reason?: string): boolean => reason === 'issue-done' || reason === 'done' || reason === 'completed' +// The broker rejects re-registering a name it never released on exit +// (relay#1116-family) with a 500 "agent '' already exists". Detect it from +// the structured payload or the message so resume can treat it as terminal +// rather than retrying the (falsely) "retryable" error forever. +const isAgentAlreadyExistsError = (error: unknown): boolean => { + const record = asRecord(error) + const data = asRecord(record?.data) + const message = stringValue(data?.error) ?? (error instanceof Error ? error.message : '') + return /already exists/iu.test(message) +} + const defaultRestartPolicy = (spec: AgentSpec): AgentSpec['restartPolicy'] | undefined => spec.role === 'implementer' ? { maxRestarts: 3, strategy: 'resume' } as AgentSpec['restartPolicy'] : spec.restartPolicy