From 92c9f4089faa195c429af1ef78f17a7663094722 Mon Sep 17 00:00:00 2001 From: kjgbot Date: Thu, 11 Jun 2026 23:13:07 +0200 Subject: [PATCH] Fix fleet agent exit latch --- .../src/fleet/internal-fleet-client.test.ts | 179 +++++++++++++++--- .../src/fleet/internal-fleet-client.ts | 26 +-- 2 files changed, 159 insertions(+), 46 deletions(-) diff --git a/packages/factory-sdk/src/fleet/internal-fleet-client.test.ts b/packages/factory-sdk/src/fleet/internal-fleet-client.test.ts index 895def71..36374ebe 100644 --- a/packages/factory-sdk/src/fleet/internal-fleet-client.test.ts +++ b/packages/factory-sdk/src/fleet/internal-fleet-client.test.ts @@ -301,7 +301,7 @@ describe('InternalFleetClient', () => { expect(exits).toEqual([{ name: 'ar-1-impl', reason: 'crashed' }]) }) - it('deduplicates distinct broker exit events for one agent death by name', () => { + it('latches one agent death by name across lagged exit callbacks', () => { vi.useFakeTimers() vi.setSystemTime(new Date('2026-06-11T00:00:00.000Z')) @@ -319,6 +319,7 @@ describe('InternalFleetClient', () => { reason: 'pty_closed', event_id: 'exit-event-1', } as BrokerEvent) + vi.advanceTimersByTime(10_000) harness.emit({ kind: 'agent_exited', name: 'ar-1-impl', @@ -326,34 +327,136 @@ describe('InternalFleetClient', () => { reason: 'pty_closed', event_id: 'exit-event-2', } as BrokerEvent) - harness.emit({ - kind: 'agent_exited', - name: 'ar-1-review', - code: 1, - reason: 'pty_closed', - event_id: 'exit-event-3', - } as BrokerEvent) - vi.advanceTimersByTime(5_000) - harness.emit({ - kind: 'agent_exited', - name: 'ar-1-impl', - code: 1, - reason: 'pty_closed', - event_id: 'exit-event-4', - } as BrokerEvent) - - expect(exits).toEqual([ - { name: 'ar-1-impl', reason: 'pty_closed' }, - { name: 'ar-1-review', reason: 'pty_closed' }, - { name: 'ar-1-impl', reason: 'pty_closed' }, - ]) + expect(exits).toEqual([{ name: 'ar-1-impl', reason: 'pty_closed' }]) } finally { vi.useRealTimers() } }) - it('suppresses legacy same-name exit callbacks inside the death window only', () => { + it('does not suppress exits for different agent names', () => { + const harness = new FakeHarnessDriverClient() + const fleet = new InternalFleetClient({ client: harness }) + const exits: Array<{ name: string; reason?: string }> = [] + + fleet.onAgentExit((name, reason) => exits.push({ name, reason })) + + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'pty_closed', + event_id: 'exit-event-1', + } as BrokerEvent) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-review', + code: 1, + reason: 'pty_closed', + event_id: 'exit-event-2', + } as BrokerEvent) + + expect(exits).toEqual([ + { name: 'ar-1-impl', reason: 'pty_closed' }, + { name: 'ar-1-review', reason: 'pty_closed' }, + ]) + }) + + it('clears the exit latch when the same agent name is spawned again', async () => { + const harness = new FakeHarnessDriverClient() + const fleet = new InternalFleetClient({ client: harness }) + const exits: Array<{ name: string; reason?: string }> = [] + + fleet.onAgentExit((name, reason) => exits.push({ name, reason })) + + await fleet.spawn({ name: 'ar-1-impl', capability: 'spawn:codex' }) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'first-death', + event_id: 'exit-event-1', + } as BrokerEvent) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'lagged-duplicate', + event_id: 'exit-event-2', + } as BrokerEvent) + + await fleet.spawn({ name: 'ar-1-impl', capability: 'spawn:codex' }) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'second-death', + event_id: 'exit-event-3', + } as BrokerEvent) + + expect(exits).toEqual([ + { name: 'ar-1-impl', reason: 'first-death' }, + { name: 'ar-1-impl', reason: 'second-death' }, + ]) + }) + + it('clears the exit latch when the same agent name is resumed', async () => { + const harness = new FakeHarnessDriverClient() + const fleet = new InternalFleetClient({ client: harness }) + const exits: Array<{ name: string; reason?: string }> = [] + + fleet.onAgentExit((name, reason) => exits.push({ name, reason })) + + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'first-death', + event_id: 'exit-event-1', + } as BrokerEvent) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'lagged-duplicate', + event_id: 'exit-event-2', + } as BrokerEvent) + + await fleet.resume({ name: 'ar-1-impl', sessionRef: 'session-original' }) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'second-death', + event_id: 'exit-event-3', + } as BrokerEvent) + + expect(exits).toEqual([ + { name: 'ar-1-impl', reason: 'first-death' }, + { name: 'ar-1-impl', reason: 'second-death' }, + ]) + }) + + it('suppresses typed duplicate agent-exit callbacks until the name is spawned again', async () => { + const harness = new FakeHarnessDriverClient() + const fleet = new InternalFleetClient({ client: harness }) + const exits: Array<{ name: string; reason?: string }> = [] + + fleet.onAgentExit((name, reason) => exits.push({ name, reason })) + + harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-1' }) + harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-2' }) + + await fleet.spawn({ name: 'ar-1-impl', capability: 'spawn:codex' }) + harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-3' }) + + expect(exits).toEqual([ + { name: 'ar-1-impl', reason: 'exited' }, + { name: 'ar-1-impl', reason: 'exited' }, + ]) + }) + + it('surfaces same-name exits again only after a lifecycle restart, not after time passes', () => { vi.useFakeTimers() vi.setSystemTime(new Date('2026-06-11T00:00:00.000Z')) @@ -364,14 +467,32 @@ describe('InternalFleetClient', () => { fleet.onAgentExit((name, reason) => exits.push({ name, reason })) - harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-1' }) - harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-2' }) - vi.advanceTimersByTime(5_000) - harness.emitAgentExited({ name: 'ar-1-impl', sessionId: 'session-3' }) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'pty_closed', + event_id: 'exit-event-1', + } as BrokerEvent) + vi.advanceTimersByTime(60_000) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-impl', + code: 1, + reason: 'pty_closed', + event_id: 'exit-event-2', + } as BrokerEvent) + harness.emit({ + kind: 'agent_exited', + name: 'ar-1-review', + code: 1, + reason: 'pty_closed', + event_id: 'exit-event-3', + } as BrokerEvent) expect(exits).toEqual([ - { name: 'ar-1-impl', reason: 'exited' }, - { name: 'ar-1-impl', reason: 'exited' }, + { name: 'ar-1-impl', reason: 'pty_closed' }, + { name: 'ar-1-review', reason: 'pty_closed' }, ]) } finally { vi.useRealTimers() diff --git a/packages/factory-sdk/src/fleet/internal-fleet-client.ts b/packages/factory-sdk/src/fleet/internal-fleet-client.ts index 45c4b7e0..7d235f99 100644 --- a/packages/factory-sdk/src/fleet/internal-fleet-client.ts +++ b/packages/factory-sdk/src/fleet/internal-fleet-client.ts @@ -49,8 +49,6 @@ const selfNode: RosterEntry['nodes'][number] = { live: true, } -const AGENT_EXIT_DEDUP_WINDOW_MS = 5_000 - export class InternalFleetClient implements FleetClient { readonly #client: HarnessDriverClientLike readonly #cwd?: string @@ -65,7 +63,7 @@ export class InternalFleetClient implements FleetClient { readonly #injectedEventIdSet = new Set() readonly #failedDeliveries = new Map() readonly #failedDeliveryIds: string[] = [] - readonly #recentAgentExits = new Map() + readonly #exitedAgentNames = new Set() #suppressedDuplicateEvents = 0 #suppressedDuplicateAgentExits = 0 #missingIdentityEvents = 0 @@ -92,6 +90,8 @@ export class InternalFleetClient implements FleetClient { continueFrom: input.sessionRef, }) + this.#clearAgentExitLatch(handle.name) + return { name: handle.name, sessionRef: sessionRefFrom(handle) } } @@ -106,6 +106,8 @@ export class InternalFleetClient implements FleetClient { continueFrom: input.sessionRef, }) + this.#clearAgentExitLatch(handle.name) + return { name: handle.name, sessionRef: sessionRefFrom(handle) ?? input.sessionRef } } @@ -291,33 +293,23 @@ export class InternalFleetClient implements FleetClient { } #rememberAgentExit(name: string): boolean { - const now = Date.now() - const prior = this.#recentAgentExits.get(name) - if (prior !== undefined && now - prior < AGENT_EXIT_DEDUP_WINDOW_MS) { + if (this.#exitedAgentNames.has(name)) { this.#suppressedDuplicateAgentExits += 1 if (this.#suppressedDuplicateAgentExits <= 3 || this.#suppressedDuplicateAgentExits % 100 === 0) { this.#logger?.debug?.('[factory-sdk] suppressed duplicate agent exit', { count: this.#suppressedDuplicateAgentExits, name, - windowMs: AGENT_EXIT_DEDUP_WINDOW_MS, }) } return true } - this.#recentAgentExits.set(name, now) - if (this.#recentAgentExits.size > 500) { - this.#pruneRecentAgentExits(now) - } + this.#exitedAgentNames.add(name) return false } - #pruneRecentAgentExits(now: number): void { - for (const [name, seenAt] of this.#recentAgentExits) { - if (now - seenAt >= AGENT_EXIT_DEDUP_WINDOW_MS) { - this.#recentAgentExits.delete(name) - } - } + #clearAgentExitLatch(name: string): void { + this.#exitedAgentNames.delete(name) } #rememberEvent(identity: EventIdentity): boolean {