From 96aa11eec8d50e42affdbc8e975bce282ac5d309 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 07:58:34 +1000 Subject: [PATCH 01/11] [Issue #829] feat(porch): add listAllProjects helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Enumerate every project on disk by walking codev/projects/*/status.yaml and .builders/*/codev/projects/*/status.yaml, deduping by project id with .builders/ taking precedence (matches findStatusPath semantics; Spec 653 keeps worktree copies fresher than main in multi-PR flows). Skips unparseable status files silently — callers that need to surface parse errors can re-invoke readState() on individual paths. --- packages/codev/src/commands/porch/state.ts | 49 ++++++++++++++++++++++ 1 file changed, 49 insertions(+) diff --git a/packages/codev/src/commands/porch/state.ts b/packages/codev/src/commands/porch/state.ts index 9b97c9109..82483165b 100644 --- a/packages/codev/src/commands/porch/state.ts +++ b/packages/codev/src/commands/porch/state.ts @@ -302,6 +302,55 @@ export function findStatusPath(workspaceRoot: string, projectId: string): string return null; } +/** + * Enumerate every project on disk, parsing each status.yaml. + * + * Precedence matches findStatusPath(): `.builders//codev/projects/` wins + * over `codev/projects/` when the same project id appears in both. In + * multi-PR flows (Spec 653) early phases merge status.yaml to main, so the + * worktree copy is the fresher one. + * + * status.yaml files that fail to parse are skipped silently — callers that + * care about parse failures can re-attempt readState() on individual paths. + */ +export function listAllProjects( + workspaceRoot: string, +): Array<{ statusPath: string; state: ProjectState }> { + const collected = new Map(); + + const addFromDir = (projectsDir: string): void => { + if (!fs.existsSync(projectsDir)) return; + const entries = fs.readdirSync(projectsDir, { withFileTypes: true }); + for (const entry of entries) { + if (!entry.isDirectory()) continue; + const statusPath = path.join(projectsDir, entry.name, 'status.yaml'); + if (!fs.existsSync(statusPath)) continue; + try { + const state = readState(statusPath); + if (!collected.has(state.id)) { + collected.set(state.id, { statusPath, state }); + } + } catch { + // Unparseable / missing-fields — skip silently + } + } + }; + + // 1. Builder worktrees first (freshest in multi-PR layouts) + const buildersDir = path.join(workspaceRoot, '.builders'); + if (fs.existsSync(buildersDir)) { + for (const wt of fs.readdirSync(buildersDir, { withFileTypes: true })) { + if (!wt.isDirectory()) continue; + addFromDir(path.join(buildersDir, wt.name, PROJECTS_DIR)); + } + } + + // 2. Local main copy + addFromDir(path.join(workspaceRoot, PROJECTS_DIR)); + + return [...collected.values()]; +} + /** * Detect project ID from the current working directory if inside a builder worktree. * Works from any subdirectory within the worktree. From d14ee52e37f110587a0cacbeb0cf579c873da990 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 07:59:02 +1000 Subject: [PATCH 02/11] [Issue #829] feat(afx): add 'workspace recover' to revive builders after reboot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a machine reboot/crash kills Tower and every shellper daemon at once, disk state survives: worktrees, porch status.yaml, terminal_sessions rows. 'afx workspace recover' enumerates that disk state, identifies builders whose shellper is dead, and respawns each via 'afx spawn --resume'. Dry-run by default; --apply actually respawns. Default 7-day recency cap on status.yaml updated_at (override with --max-age N or --include-stale). Confirmation prompt before respawn unless -y/--yes. Eligibility predicate (evaluateEligibility): - phase is not terminal (verified/complete) - a terminal_sessions row exists for the derived builder id - shellper PID is dead AND socket file is gone (either signal of life keeps the builder out of the revive set; intentionally conservative) - worktree exists with a .git marker - status.yaml updated within the recency window Builders idle at the pr gate are deliberately included — humans may want to dispatch the builder to address review feedback without manual respawn. The predicate is pure (filesystem and process probes are injected) so the 29 unit tests cover every branch without touching real I/O. --- .../__tests__/workspace-recover.test.ts | 366 ++++++++++++++++++ packages/codev/src/agent-farm/cli.ts | 28 ++ .../agent-farm/commands/workspace-recover.ts | 296 ++++++++++++++ 3 files changed, 690 insertions(+) create mode 100644 packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts create mode 100644 packages/codev/src/agent-farm/commands/workspace-recover.ts diff --git a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts new file mode 100644 index 000000000..e98cb49d7 --- /dev/null +++ b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts @@ -0,0 +1,366 @@ +/** + * Tests for `afx workspace recover` — eligibility predicate, builder-info + * derivation, worktree resolution, and listAllProjects precedence. + * + * Issue #829. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, rmSync, mkdirSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { + evaluateEligibility, + deriveBuilderInfo, + resolveWorktreePath, + formatRelativeAge, + type EligibilityInputs, +} from '../commands/workspace-recover.js'; +import { listAllProjects } from '../../commands/porch/state.js'; +import type { ProjectState } from '../../commands/porch/types.js'; +import type { DbTerminalSession } from '../servers/tower-types.js'; + +function makeState(overrides: Partial = {}): ProjectState { + return { + id: '0087', + title: 'Test project', + protocol: 'spir', + phase: 'implement', + plan_phases: [], + current_plan_phase: null, + gates: {}, + iteration: 1, + build_complete: false, + history: [], + started_at: '2026-05-20T00:00:00.000Z', + updated_at: new Date().toISOString(), + ...overrides, + }; +} + +function makeSession(overrides: Partial = {}): DbTerminalSession { + return { + id: 'term-123', + workspace_path: '/workspace', + type: 'builder', + role_id: 'builder-spir-87', + pid: null, + shellper_socket: '/tmp/shellper.sock', + shellper_pid: 12345, + shellper_start_time: Date.now(), + label: null, + cwd: null, + created_at: new Date().toISOString(), + ...overrides, + }; +} + +function defaults(): Omit { + return { + maxAgeDays: 7, + includeStale: false, + isProcessAlive: () => false, + socketExists: () => false, + }; +} + +describe('evaluateEligibility', () => { + it('skips terminal phase (verified)', () => { + const result = evaluateEligibility({ + state: makeState({ phase: 'verified' }), + session: makeSession(), + worktreeExists: true, + ageDays: 0, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'terminal' }); + }); + + it('skips terminal phase (complete)', () => { + const result = evaluateEligibility({ + state: makeState({ phase: 'complete' }), + session: makeSession(), + worktreeExists: true, + ageDays: 0, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'terminal' }); + }); + + it('skips when no terminal_sessions row exists', () => { + const result = evaluateEligibility({ + state: makeState(), + session: null, + worktreeExists: true, + ageDays: 0, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'no_session_row' }); + }); + + it('skips when shellper PID is alive', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => true, + }); + expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + }); + + it('skips when socket file still exists (treated as alive)', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => false, + socketExists: () => true, + }); + expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + }); + + it('skips when worktree is missing', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: false, + ageDays: 0, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'worktree_missing' }); + }); + + it('skips stale projects when --include-stale not set', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: true, + ageDays: 30, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'stale' }); + }); + + it('honors --include-stale on otherwise-stale projects', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: true, + ageDays: 30, + ...defaults(), + includeStale: true, + }); + expect(result).toEqual({ eligible: true }); + }); + + it('returns eligible when all conditions are met', () => { + const result = evaluateEligibility({ + state: makeState(), + session: makeSession(), + worktreeExists: true, + ageDays: 2, + ...defaults(), + }); + expect(result).toEqual({ eligible: true }); + }); + + it('checks predicates in cheap-first order (terminal beats missing session)', () => { + const result = evaluateEligibility({ + state: makeState({ phase: 'verified' }), + session: null, + worktreeExists: false, + ageDays: 999, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'terminal' }); + }); +}); + +describe('deriveBuilderInfo', () => { + it('maps SPIR state to builder-spir-', () => { + expect(deriveBuilderInfo(makeState({ id: '0087', protocol: 'spir' }))).toEqual({ + builderId: 'builder-spir-87', + issueArg: '87', + cliProtocol: 'spir', + }); + }); + + it('aliases protocol: spider → spir for both builderId and CLI invocation', () => { + expect(deriveBuilderInfo(makeState({ id: '0092', protocol: 'spider' }))).toEqual({ + builderId: 'builder-spir-92', + issueArg: '92', + cliProtocol: 'spir', + }); + }); + + it('handles bugfix project IDs (bugfix-693 → builder-bugfix-693, issue 693)', () => { + expect(deriveBuilderInfo(makeState({ id: 'bugfix-693', protocol: 'bugfix' }))).toEqual({ + builderId: 'builder-bugfix-693', + issueArg: '693', + cliProtocol: 'bugfix', + }); + }); + + it('handles PIR projects', () => { + expect(deriveBuilderInfo(makeState({ id: '0829', protocol: 'pir' }))).toEqual({ + builderId: 'builder-pir-829', + issueArg: '829', + cliProtocol: 'pir', + }); + }); + + it('handles ASPIR projects', () => { + expect(deriveBuilderInfo(makeState({ id: '0438', protocol: 'aspir' }))).toEqual({ + builderId: 'builder-aspir-438', + issueArg: '438', + cliProtocol: 'aspir', + }); + }); + + it('handles AIR projects', () => { + expect(deriveBuilderInfo(makeState({ id: '0501', protocol: 'air' }))).toEqual({ + builderId: 'builder-air-501', + issueArg: '501', + cliProtocol: 'air', + }); + }); +}); + +describe('resolveWorktreePath', () => { + let tmp: string; + let buildersDir: string; + + beforeEach(() => { + tmp = mkdtempSync(join(tmpdir(), 'recover-test-')); + buildersDir = join(tmp, '.builders'); + mkdirSync(buildersDir, { recursive: true }); + }); + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }); + }); + + it('finds ID-only worktree (Spec 653 layout)', () => { + const wt = join(buildersDir, 'spir-87'); + mkdirSync(join(wt, '.git'), { recursive: true }); + const result = resolveWorktreePath(buildersDir, makeState({ id: '0087', protocol: 'spir' })); + expect(result).toBe(wt); + }); + + it('falls back to legacy title-suffixed worktree', () => { + const wt = join(buildersDir, 'spir-87-some-title-slug'); + mkdirSync(join(wt, '.git'), { recursive: true }); + const result = resolveWorktreePath(buildersDir, makeState({ id: '0087', protocol: 'spir' })); + expect(result).toBe(wt); + }); + + it('returns null when no worktree matches', () => { + const result = resolveWorktreePath(buildersDir, makeState({ id: '0087', protocol: 'spir' })); + expect(result).toBeNull(); + }); + + it('ignores directories with the right prefix but no .git', () => { + mkdirSync(join(buildersDir, 'spir-87'), { recursive: true }); + const result = resolveWorktreePath(buildersDir, makeState({ id: '0087', protocol: 'spir' })); + expect(result).toBeNull(); + }); + + it('resolves bugfix worktree by issue number', () => { + const wt = join(buildersDir, 'bugfix-693'); + mkdirSync(join(wt, '.git'), { recursive: true }); + const result = resolveWorktreePath(buildersDir, makeState({ id: 'bugfix-693', protocol: 'bugfix' })); + expect(result).toBe(wt); + }); +}); + +describe('listAllProjects (precedence)', () => { + let tmp: string; + + beforeEach(() => { + tmp = mkdtempSync(join(tmpdir(), 'recover-list-')); + }); + + afterEach(() => { + rmSync(tmp, { recursive: true, force: true }); + }); + + function writeStatus(dir: string, state: Partial): void { + mkdirSync(dir, { recursive: true }); + const full = { ...makeState(state) }; + const yaml = [ + `id: '${full.id}'`, + `title: '${full.title}'`, + `protocol: ${full.protocol}`, + `phase: ${full.phase}`, + 'plan_phases: []', + 'current_plan_phase: null', + 'gates: {}', + `iteration: ${full.iteration}`, + `build_complete: ${full.build_complete}`, + 'history: []', + `started_at: '${full.started_at}'`, + `updated_at: '${full.updated_at}'`, + ].join('\n'); + writeFileSync(join(dir, 'status.yaml'), yaml + '\n', 'utf-8'); + } + + it('returns projects from codev/projects when no .builders copy exists', () => { + writeStatus(join(tmp, 'codev', 'projects', '0087-foo'), { id: '0087', phase: 'implement' }); + const result = listAllProjects(tmp); + expect(result).toHaveLength(1); + expect(result[0].state.id).toBe('0087'); + expect(result[0].statusPath).toBe(join(tmp, 'codev', 'projects', '0087-foo', 'status.yaml')); + }); + + it('prefers .builders/ copy when same project id exists in both', () => { + writeStatus(join(tmp, 'codev', 'projects', '0087-foo'), { id: '0087', phase: 'specify' }); + writeStatus( + join(tmp, '.builders', 'spir-87', 'codev', 'projects', '0087-foo'), + { id: '0087', phase: 'review' }, + ); + const result = listAllProjects(tmp); + expect(result).toHaveLength(1); + expect(result[0].state.phase).toBe('review'); + expect(result[0].statusPath).toContain('.builders'); + }); + + it('returns empty array for a workspace with no projects', () => { + expect(listAllProjects(tmp)).toEqual([]); + }); + + it('skips unparseable status.yaml files', () => { + const dir = join(tmp, 'codev', 'projects', '0099-broken'); + mkdirSync(dir, { recursive: true }); + writeFileSync(join(dir, 'status.yaml'), 'this is: not\n valid:\nyaml: [\n', 'utf-8'); + writeStatus(join(tmp, 'codev', 'projects', '0087-foo'), { id: '0087' }); + const result = listAllProjects(tmp); + expect(result).toHaveLength(1); + expect(result[0].state.id).toBe('0087'); + }); +}); + +describe('formatRelativeAge', () => { + it('formats minutes', () => { + const iso = new Date(Date.now() - 30 * 60_000).toISOString(); + expect(formatRelativeAge(iso)).toMatch(/^\d+m ago$/); + }); + + it('formats hours', () => { + const iso = new Date(Date.now() - 3 * 3600_000).toISOString(); + expect(formatRelativeAge(iso)).toMatch(/^\d+h ago$/); + }); + + it('formats days', () => { + const iso = new Date(Date.now() - 5 * 86_400_000).toISOString(); + expect(formatRelativeAge(iso)).toMatch(/^\d+d ago$/); + }); + + it('returns placeholder for malformed ISO', () => { + expect(formatRelativeAge('not a date')).toBe('—'); + }); +}); diff --git a/packages/codev/src/agent-farm/cli.ts b/packages/codev/src/agent-farm/cli.ts index 6e4d89f22..dc1c8591d 100644 --- a/packages/codev/src/agent-farm/cli.ts +++ b/packages/codev/src/agent-farm/cli.ts @@ -132,6 +132,34 @@ export async function runAgentFarm(args: string[]): Promise { } }); + // Issue #829: revive builders whose shellper died (e.g. after machine reboot). + workspaceCmd + .command('recover') + .description('Revive builders whose shellper died (e.g. after machine reboot)') + .option('--apply', 'Actually respawn builders (default: dry-run preview only)') + .option('--max-age ', 'Skip projects with status.yaml older than N days', '7') + .option('--include-stale', 'Ignore --max-age (revive arbitrarily old projects)') + .option('-y, --yes', 'Skip --apply confirmation prompt') + .action(async (options: { apply?: boolean; maxAge?: string; includeStale?: boolean; yes?: boolean }) => { + const { workspaceRecover } = await import('./commands/workspace-recover.js'); + try { + const parsedMaxAge = options.maxAge ? parseInt(options.maxAge, 10) : undefined; + if (parsedMaxAge !== undefined && (Number.isNaN(parsedMaxAge) || parsedMaxAge < 0)) { + logger.error(`Invalid --max-age value: ${options.maxAge}`); + process.exit(1); + } + await workspaceRecover({ + apply: options.apply, + maxAge: parsedMaxAge, + includeStale: options.includeStale, + yes: options.yes, + }); + } catch (error) { + logger.error(error instanceof Error ? error.message : String(error)); + process.exit(1); + } + }); + // Deprecated alias: `afx dash` → `afx workspace` const dashCmd = program .command('dash') diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts new file mode 100644 index 000000000..6109a7a2e --- /dev/null +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -0,0 +1,296 @@ +// workspace recover — revive builders whose shellper died (e.g. machine reboot). +// Issue #829. Dry-run by default; --apply actually respawns. + +import { spawn } from 'node:child_process'; +import { existsSync, readdirSync } from 'node:fs'; +import { join } from 'node:path'; +import chalk from 'chalk'; + +import { getConfig } from '../utils/index.js'; +import { logger } from '../utils/logger.js'; +import { buildAgentName, stripLeadingZeros } from '../utils/agent-names.js'; +import { processExists, getTerminalSessionsForWorkspace } from '../servers/tower-terminals.js'; +import { closeGlobalDb } from '../db/index.js'; +import { listAllProjects } from '../../commands/porch/state.js'; +import type { ProjectState } from '../../commands/porch/types.js'; +import type { DbTerminalSession } from '../servers/tower-types.js'; +import { confirm } from '../../lib/cli-prompts.js'; + +const TERMINAL_PHASES = new Set(['verified', 'complete']); +const SPIDER_TO_SPIR = 'spir'; +const DEFAULT_MAX_AGE_DAYS = 7; + +export interface WorkspaceRecoverOptions { + apply?: boolean; + maxAge?: number; + includeStale?: boolean; + yes?: boolean; +} + +export type IneligibleReason = + | 'terminal' + | 'no_session_row' + | 'shellper_alive' + | 'worktree_missing' + | 'stale'; + +export type EligibilityResult = + | { eligible: true } + | { eligible: false; reason: IneligibleReason }; + +export interface EligibilityInputs { + state: ProjectState; + session: DbTerminalSession | null; + worktreeExists: boolean; + ageDays: number; + maxAgeDays: number; + includeStale: boolean; + isProcessAlive: (pid: number) => boolean; + socketExists: (socket: string) => boolean; +} + +/** + * Pure predicate — no I/O. All filesystem and process probes happen in the + * caller and are passed in via `isProcessAlive` and `socketExists`. This keeps + * the predicate trivially unit-testable. + * + * Order matters: cheap structural checks (phase, session row) come first; + * filesystem-touching checks (worktree, socket) later. + */ +export function evaluateEligibility(inputs: EligibilityInputs): EligibilityResult { + const { + state, session, worktreeExists, ageDays, maxAgeDays, includeStale, + isProcessAlive, socketExists, + } = inputs; + + if (TERMINAL_PHASES.has(state.phase)) { + return { eligible: false, reason: 'terminal' }; + } + if (!session) { + return { eligible: false, reason: 'no_session_row' }; + } + + // Either signal of life keeps the builder out of the revive set. + // We don't try to open the socket — file existence + PID liveness is enough + // and avoids any chance of disturbing a healthy shellper. + const pidAlive = session.shellper_pid !== null && isProcessAlive(session.shellper_pid); + const socketPresent = session.shellper_socket !== null && socketExists(session.shellper_socket); + if (pidAlive || socketPresent) { + return { eligible: false, reason: 'shellper_alive' }; + } + + if (!worktreeExists) { + return { eligible: false, reason: 'worktree_missing' }; + } + if (!includeStale && ageDays > maxAgeDays) { + return { eligible: false, reason: 'stale' }; + } + return { eligible: true }; +} + +export interface BuilderInfo { + builderId: string; + issueArg: string; + cliProtocol: string; +} + +/** + * Derive the inputs needed to invoke `afx spawn --resume --protocol ` + * and the SQLite `role_id` to look up the builder's terminal session. + * + * Normalizes the legacy `spider` protocol alias to `spir`. + */ +export function deriveBuilderInfo(state: ProjectState): BuilderInfo { + const rawProtocol = state.protocol === 'spider' ? SPIDER_TO_SPIR : state.protocol; + + if (state.protocol === 'bugfix') { + const numericId = state.id.replace(/^bugfix-/, ''); + return { + builderId: buildAgentName('bugfix', numericId), + issueArg: numericId, + cliProtocol: 'bugfix', + }; + } + return { + builderId: buildAgentName('spec', state.id, rawProtocol), + issueArg: stripLeadingZeros(state.id), + cliProtocol: rawProtocol, + }; +} + +/** + * Resolve the builder's worktree path on disk, handling both the Spec-653 + * ID-only layout and the legacy title-suffixed form. + */ +export function resolveWorktreePath(buildersDir: string, state: ProjectState): string | null { + const info = deriveBuilderInfo(state); + const idOnlyName = `${info.cliProtocol}-${info.issueArg}`; + const idOnlyPath = join(buildersDir, idOnlyName); + if (existsSync(idOnlyPath) && existsSync(join(idOnlyPath, '.git'))) { + return idOnlyPath; + } + + if (!existsSync(buildersDir)) return null; + const prefix = `${info.cliProtocol}-${info.issueArg}-`; + for (const entry of readdirSync(buildersDir, { withFileTypes: true })) { + if (!entry.isDirectory() || !entry.name.startsWith(prefix)) continue; + const candidate = join(buildersDir, entry.name); + if (existsSync(join(candidate, '.git'))) return candidate; + } + return null; +} + +export function formatRelativeAge(iso: string): string { + const ms = Date.now() - Date.parse(iso); + if (Number.isNaN(ms) || ms < 0) return '—'; + const minutes = Math.floor(ms / 60_000); + if (minutes < 60) return `${minutes}m ago`; + const hours = Math.floor(minutes / 60); + if (hours < 24) return `${hours}h ago`; + const days = Math.floor(hours / 24); + return `${days}d ago`; +} + +function reasonLabel(reason: IneligibleReason): string { + switch (reason) { + case 'terminal': return 'terminal'; + case 'no_session_row': return 'no session row'; + case 'shellper_alive': return 'shellper alive'; + case 'worktree_missing': return 'worktree missing'; + case 'stale': return 'stale'; + } +} + +interface RecoverRow { + state: ProjectState; + builderInfo: BuilderInfo; + worktreePath: string | null; + eligibility: EligibilityResult; +} + +function printPreview(rows: RecoverRow[]): void { + const widths = [6, 9, 12, 14, 10, 20]; + logger.row(['ID', 'PROTOCOL', 'PHASE', 'UPDATED', 'STATUS', 'REASON'], widths); + logger.row(['─'.repeat(6), '─'.repeat(9), '─'.repeat(12), '─'.repeat(14), '─'.repeat(10), '─'.repeat(20)], widths); + for (const row of rows) { + const status = row.eligibility.eligible + ? chalk.green('revive') + : chalk.gray('skip'); + const reason = row.eligibility.eligible ? '—' : reasonLabel(row.eligibility.reason); + logger.row( + [ + row.state.id, + row.state.protocol, + row.state.phase, + formatRelativeAge(row.state.updated_at), + status, + reason, + ], + widths, + ); + } +} + +async function respawnBuilder(info: BuilderInfo): Promise { + await new Promise((resolvePromise, rejectPromise) => { + const child = spawn( + 'afx', + ['spawn', info.issueArg, '--resume', '--protocol', info.cliProtocol], + { stdio: 'inherit' }, + ); + child.on('error', rejectPromise); + child.on('exit', (code) => { + if (code === 0) resolvePromise(); + else rejectPromise(new Error(`afx spawn exited with code ${code}`)); + }); + }); +} + +export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): Promise { + const config = getConfig(); + const maxAgeDays = options.maxAge ?? DEFAULT_MAX_AGE_DAYS; + const includeStale = options.includeStale ?? false; + const apply = options.apply ?? false; + + logger.header(`Workspace Recover${apply ? '' : ' (dry-run)'}`); + logger.kv('Workspace', config.workspaceRoot); + if (!includeStale) logger.kv('Max age', `${maxAgeDays} day(s)`); + logger.blank(); + + const projects = listAllProjects(config.workspaceRoot); + if (projects.length === 0) { + logger.info('No porch projects found.'); + return; + } + + let sessions: DbTerminalSession[]; + try { + sessions = getTerminalSessionsForWorkspace(config.workspaceRoot); + } finally { + closeGlobalDb(); + } + const sessionByRoleId = new Map(); + for (const s of sessions) { + if (s.type === 'builder' && s.role_id) sessionByRoleId.set(s.role_id, s); + } + + const rows: RecoverRow[] = projects.map(({ state }) => { + const builderInfo = deriveBuilderInfo(state); + const session = sessionByRoleId.get(builderInfo.builderId) ?? null; + const worktreePath = resolveWorktreePath(config.buildersDir, state); + const ageDays = (Date.now() - Date.parse(state.updated_at)) / 86_400_000; + const eligibility = evaluateEligibility({ + state, session, + worktreeExists: worktreePath !== null, + ageDays, maxAgeDays, includeStale, + isProcessAlive: processExists, + socketExists: existsSync, + }); + return { state, builderInfo, worktreePath, eligibility }; + }); + + printPreview(rows); + + const eligible = rows.filter((r): r is RecoverRow & { eligibility: { eligible: true } } => r.eligibility.eligible); + logger.blank(); + logger.kv('Eligible', `${eligible.length} / ${rows.length}`); + + if (eligible.length === 0) { + logger.info(apply ? 'Nothing to revive.' : 'Nothing would be revived.'); + return; + } + + if (!apply) { + logger.info(`Run with --apply to respawn ${eligible.length} builder(s).`); + return; + } + + if (!options.yes) { + const proceed = await confirm(`Proceed to respawn ${eligible.length} builder(s)?`, false); + if (!proceed) { + logger.info('Aborted.'); + return; + } + } + + let succeeded = 0; + let failed = 0; + for (const row of eligible) { + logger.blank(); + logger.info(`Respawning ${row.builderInfo.builderId} (issue ${row.builderInfo.issueArg}, ${row.builderInfo.cliProtocol})...`); + try { + await respawnBuilder(row.builderInfo); + succeeded++; + } catch (err) { + failed++; + logger.error(`Failed to respawn ${row.builderInfo.builderId}: ${err instanceof Error ? err.message : String(err)}`); + } + } + + logger.blank(); + logger.kv('Respawned', String(succeeded)); + if (failed > 0) { + logger.kv('Failed', String(failed)); + process.exit(1); + } +} From 69d65d3773f524edbee421e25d78bc382814f5bd Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 08:30:37 +1000 Subject: [PATCH 03/11] [Issue #829] fix: address cmap-3 review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four fixes from the 3-way review (Gemini + Codex + Claude): 1. Liveness probe trusts the PID first (Gemini blocker). Shellper sockets live in ~/.codev/run/ and are only unlinked on graceful SIGTERM (shellper-main.ts:196-206). The OS does not clean that path on reboot, so the previous "PID alive OR socket present → alive" rule would falsely mark every post-crash builder as alive and defeat the feature's primary use case. New rule: when shellper_pid is known, a dead PID is definitive (ignore the socket); fall back to socket presence only when shellper_pid is null (legacy pre-PID rows). 2. Protocol allowlist + unsupported_protocol reason (Codex blocker). afx spawn switches to issue-driven mode whenever a positional arg is present (spawn.ts:209-217), so 'afx spawn --resume --protocol experiment' mis-routes through spawnSpec and fails. Restrict revival to issue-driven protocols only: spir, aspir, air, pir, bugfix. deriveBuilderInfo returns null for the rest (including legacy spider, which was retired long ago), and evaluateEligibility surfaces them with the new 'unsupported_protocol' reason in the preview table so operators see what was skipped and why. 3. Aggregate duplicate session rows by role_id (Codex defensive fix). terminal_sessions has no UNIQUE constraint on role_id. If a prior recovery cycle left a dead row behind, the previous Map collapsed to last-write-wins non-deterministically. New behavior: bucket all matching rows per role_id and treat the builder as alive if ANY matching row's PID is alive — false positives (duplicate respawn) are much worse than false negatives (manual reattempt). 4. Use process.execPath + process.argv[1] for the respawn subprocess (Gemini hardening). Bare 'afx' relies on $PATH lookup, which can pick up an older global install or fail under npx / dev invocations. Re-invoking the same node binary + same CLI entry guarantees the spawned 'spawn' subcommand matches the version that ran the recover. Plus the agreed nit: listAllProjects accepts an optional onParseError callback so workspaceRecover can debug-log skipped malformed status.yaml files (previously swallowed silently). Tests grow from 29 → 41, covering the new liveness rule (dead PID + stale socket → revive), the unsupported_protocol skip across six protocol names, the duplicate-row aggregation in both directions, and the onParseError callback contract. --- .../__tests__/workspace-recover.test.ts | 197 ++++++++++++++---- .../agent-farm/commands/workspace-recover.ts | 112 +++++++--- packages/codev/src/commands/porch/state.ts | 9 +- 3 files changed, 246 insertions(+), 72 deletions(-) diff --git a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts index e98cb49d7..ed4b82667 100644 --- a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts +++ b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts @@ -16,6 +16,7 @@ import { resolveWorktreePath, formatRelativeAge, type EligibilityInputs, + type BuilderInfo, } from '../commands/workspace-recover.js'; import { listAllProjects } from '../../commands/porch/state.js'; import type { ProjectState } from '../../commands/porch/types.js'; @@ -56,7 +57,11 @@ function makeSession(overrides: Partial = {}): DbTerminalSess }; } -function defaults(): Omit { +function makeBuilderInfo(overrides: Partial = {}): BuilderInfo { + return { builderId: 'builder-spir-87', issueArg: '87', cliProtocol: 'spir', ...overrides }; +} + +function defaults(): Omit { return { maxAgeDays: 7, includeStale: false, @@ -66,10 +71,11 @@ function defaults(): Omit { - it('skips terminal phase (verified)', () => { + it('skips terminal phase (verified) — comes before all other checks', () => { const result = evaluateEligibility({ state: makeState({ phase: 'verified' }), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: true, ageDays: 0, ...defaults(), @@ -80,7 +86,8 @@ describe('evaluateEligibility', () => { it('skips terminal phase (complete)', () => { const result = evaluateEligibility({ state: makeState({ phase: 'complete' }), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: true, ageDays: 0, ...defaults(), @@ -88,46 +95,130 @@ describe('evaluateEligibility', () => { expect(result).toEqual({ eligible: false, reason: 'terminal' }); }); - it('skips when no terminal_sessions row exists', () => { + it('skips when builderInfo is null (unsupported protocol)', () => { const result = evaluateEligibility({ - state: makeState(), - session: null, + state: makeState({ protocol: 'experiment' }), + builderInfo: null, + sessions: [makeSession()], worktreeExists: true, ageDays: 0, ...defaults(), }); - expect(result).toEqual({ eligible: false, reason: 'no_session_row' }); + expect(result).toEqual({ eligible: false, reason: 'unsupported_protocol' }); }); - it('skips when shellper PID is alive', () => { + it('skips when no terminal_sessions row exists', () => { const result = evaluateEligibility({ state: makeState(), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [], worktreeExists: true, ageDays: 0, ...defaults(), - isProcessAlive: () => true, }); - expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + expect(result).toEqual({ eligible: false, reason: 'no_session_row' }); }); - it('skips when socket file still exists (treated as alive)', () => { - const result = evaluateEligibility({ - state: makeState(), - session: makeSession(), - worktreeExists: true, - ageDays: 0, - ...defaults(), - isProcessAlive: () => false, - socketExists: () => true, + describe('liveness probe (PID-first per Gemini #829 review)', () => { + it('skips when shellper PID is alive (socket also present)', () => { + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession({ shellper_pid: 12345, shellper_socket: '/sock' })], + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => true, + socketExists: () => true, + }); + expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + }); + + it('revives when PID is known dead even though stale socket file remains', () => { + // Critical for reboot recovery: sockets live in ~/.codev/run/ which the + // OS does not clear on reboot. The dead PID is definitive evidence. + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession({ shellper_pid: 12345, shellper_socket: '/sock' })], + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => false, + socketExists: () => true, + }); + expect(result).toEqual({ eligible: true }); + }); + + it('falls back to socket check when shellper_pid is null (legacy rows)', () => { + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession({ shellper_pid: null, shellper_socket: '/sock' })], + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => false, + socketExists: () => true, + }); + expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + }); + + it('revives when shellper_pid is null AND no socket file', () => { + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession({ shellper_pid: null, shellper_socket: '/sock' })], + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => false, + socketExists: () => false, + }); + expect(result).toEqual({ eligible: true }); + }); + }); + + describe('duplicate row aggregation (Codex #829 review)', () => { + it('treats builder as alive if ANY matching row has a live PID', () => { + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [ + makeSession({ id: 'dead-row', shellper_pid: 11111 }), + makeSession({ id: 'live-row', shellper_pid: 22222 }), + ], + worktreeExists: true, + ageDays: 0, + ...defaults(), + // pid 11111 dead, pid 22222 alive + isProcessAlive: (pid) => pid === 22222, + }); + expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); + }); + + it('revives only when ALL matching rows look dead', () => { + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [ + makeSession({ id: 'row-a', shellper_pid: 11111 }), + makeSession({ id: 'row-b', shellper_pid: 22222 }), + ], + worktreeExists: true, + ageDays: 0, + ...defaults(), + isProcessAlive: () => false, + }); + expect(result).toEqual({ eligible: true }); }); - expect(result).toEqual({ eligible: false, reason: 'shellper_alive' }); }); it('skips when worktree is missing', () => { const result = evaluateEligibility({ state: makeState(), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: false, ageDays: 0, ...defaults(), @@ -138,7 +229,8 @@ describe('evaluateEligibility', () => { it('skips stale projects when --include-stale not set', () => { const result = evaluateEligibility({ state: makeState(), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: true, ageDays: 30, ...defaults(), @@ -149,7 +241,8 @@ describe('evaluateEligibility', () => { it('honors --include-stale on otherwise-stale projects', () => { const result = evaluateEligibility({ state: makeState(), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: true, ageDays: 30, ...defaults(), @@ -161,7 +254,8 @@ describe('evaluateEligibility', () => { it('returns eligible when all conditions are met', () => { const result = evaluateEligibility({ state: makeState(), - session: makeSession(), + builderInfo: makeBuilderInfo(), + sessions: [makeSession()], worktreeExists: true, ageDays: 2, ...defaults(), @@ -169,10 +263,11 @@ describe('evaluateEligibility', () => { expect(result).toEqual({ eligible: true }); }); - it('checks predicates in cheap-first order (terminal beats missing session)', () => { + it('checks predicates in cheap-first order (terminal beats unsupported)', () => { const result = evaluateEligibility({ - state: makeState({ phase: 'verified' }), - session: null, + state: makeState({ phase: 'verified', protocol: 'experiment' }), + builderInfo: null, + sessions: [], worktreeExists: false, ageDays: 999, ...defaults(), @@ -190,14 +285,6 @@ describe('deriveBuilderInfo', () => { }); }); - it('aliases protocol: spider → spir for both builderId and CLI invocation', () => { - expect(deriveBuilderInfo(makeState({ id: '0092', protocol: 'spider' }))).toEqual({ - builderId: 'builder-spir-92', - issueArg: '92', - cliProtocol: 'spir', - }); - }); - it('handles bugfix project IDs (bugfix-693 → builder-bugfix-693, issue 693)', () => { expect(deriveBuilderInfo(makeState({ id: 'bugfix-693', protocol: 'bugfix' }))).toEqual({ builderId: 'builder-bugfix-693', @@ -229,6 +316,15 @@ describe('deriveBuilderInfo', () => { cliProtocol: 'air', }); }); + + describe('unsupported protocols return null', () => { + it.each(['experiment', 'maintain', 'task', 'protocol', 'release', 'spider'])( + 'returns null for protocol: %s', + (protocol) => { + expect(deriveBuilderInfo(makeState({ protocol }))).toBeNull(); + }, + ); + }); }); describe('resolveWorktreePath', () => { @@ -276,9 +372,18 @@ describe('resolveWorktreePath', () => { const result = resolveWorktreePath(buildersDir, makeState({ id: 'bugfix-693', protocol: 'bugfix' })); expect(result).toBe(wt); }); + + it('returns null for unsupported protocols without filesystem lookups', () => { + // An experiment dir on disk would normally be found if the protocol were + // supported — but for unsupported protocols we short-circuit to null. + const wt = join(buildersDir, 'experiment-abcd'); + mkdirSync(join(wt, '.git'), { recursive: true }); + const result = resolveWorktreePath(buildersDir, makeState({ id: 'abcd', protocol: 'experiment' })); + expect(result).toBeNull(); + }); }); -describe('listAllProjects (precedence)', () => { +describe('listAllProjects (precedence + diagnostics)', () => { let tmp: string; beforeEach(() => { @@ -333,7 +438,7 @@ describe('listAllProjects (precedence)', () => { expect(listAllProjects(tmp)).toEqual([]); }); - it('skips unparseable status.yaml files', () => { + it('skips unparseable status.yaml files silently by default', () => { const dir = join(tmp, 'codev', 'projects', '0099-broken'); mkdirSync(dir, { recursive: true }); writeFileSync(join(dir, 'status.yaml'), 'this is: not\n valid:\nyaml: [\n', 'utf-8'); @@ -342,6 +447,20 @@ describe('listAllProjects (precedence)', () => { expect(result).toHaveLength(1); expect(result[0].state.id).toBe('0087'); }); + + it('invokes onParseError callback when provided', () => { + const dir = join(tmp, 'codev', 'projects', '0099-broken'); + mkdirSync(dir, { recursive: true }); + writeFileSync(join(dir, 'status.yaml'), 'this is: not\n valid:\nyaml: [\n', 'utf-8'); + const errors: Array<{ path: string; err: unknown }> = []; + const result = listAllProjects(tmp, { + onParseError: (path, err) => errors.push({ path, err }), + }); + expect(result).toEqual([]); + expect(errors).toHaveLength(1); + expect(errors[0].path).toBe(join(dir, 'status.yaml')); + expect(errors[0].err).toBeInstanceOf(Error); + }); }); describe('formatRelativeAge', () => { diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts index 6109a7a2e..348eda02a 100644 --- a/packages/codev/src/agent-farm/commands/workspace-recover.ts +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -17,9 +17,16 @@ import type { DbTerminalSession } from '../servers/tower-types.js'; import { confirm } from '../../lib/cli-prompts.js'; const TERMINAL_PHASES = new Set(['verified', 'complete']); -const SPIDER_TO_SPIR = 'spir'; const DEFAULT_MAX_AGE_DAYS = 7; +// Protocols that `afx spawn --resume --protocol ` can revive cleanly: +// issue-driven families with a positional issue arg and a stable worktree layout. +// `experiment` / `maintain` / `task` / bare protocol-mode builders can't be +// resumed by ID through spawn.ts and are skipped with `unsupported_protocol`. +// (Legacy `spider` is also excluded — it was retired long ago, and any stray +// project still carrying that protocol value should be treated as unsupported.) +const REVIVABLE_PROTOCOLS = new Set(['spir', 'aspir', 'air', 'pir', 'bugfix']); + export interface WorkspaceRecoverOptions { apply?: boolean; maxAge?: number; @@ -29,6 +36,7 @@ export interface WorkspaceRecoverOptions { export type IneligibleReason = | 'terminal' + | 'unsupported_protocol' | 'no_session_row' | 'shellper_alive' | 'worktree_missing' @@ -40,7 +48,8 @@ export type EligibilityResult = export interface EligibilityInputs { state: ProjectState; - session: DbTerminalSession | null; + builderInfo: BuilderInfo | null; + sessions: DbTerminalSession[]; worktreeExists: boolean; ageDays: number; maxAgeDays: number; @@ -49,36 +58,56 @@ export interface EligibilityInputs { socketExists: (socket: string) => boolean; } +/** + * Liveness rule per row (Gemini cmap finding): + * - shellper_pid known + alive → ALIVE + * - shellper_pid known + dead → DEAD (socket is ignored; the OS doesn't clean + * ~/.codev/run/ on reboot, so a stale socket + * would otherwise falsely mark the row alive + * and defeat the primary recovery use case) + * - shellper_pid null + socket file exists → ALIVE (legacy / pre-PID rows) + * - shellper_pid null + no socket → DEAD + */ +function isSessionAlive( + session: DbTerminalSession, + isProcessAlive: (pid: number) => boolean, + socketExists: (socket: string) => boolean, +): boolean { + if (session.shellper_pid !== null) { + return isProcessAlive(session.shellper_pid); + } + return session.shellper_socket !== null && socketExists(session.shellper_socket); +} + /** * Pure predicate — no I/O. All filesystem and process probes happen in the * caller and are passed in via `isProcessAlive` and `socketExists`. This keeps * the predicate trivially unit-testable. * - * Order matters: cheap structural checks (phase, session row) come first; - * filesystem-touching checks (worktree, socket) later. + * Order matters: cheap structural checks come first; filesystem-touching + * checks (worktree) later. */ export function evaluateEligibility(inputs: EligibilityInputs): EligibilityResult { const { - state, session, worktreeExists, ageDays, maxAgeDays, includeStale, + state, builderInfo, sessions, worktreeExists, ageDays, maxAgeDays, includeStale, isProcessAlive, socketExists, } = inputs; if (TERMINAL_PHASES.has(state.phase)) { return { eligible: false, reason: 'terminal' }; } - if (!session) { + if (builderInfo === null) { + return { eligible: false, reason: 'unsupported_protocol' }; + } + if (sessions.length === 0) { return { eligible: false, reason: 'no_session_row' }; } - - // Either signal of life keeps the builder out of the revive set. - // We don't try to open the socket — file existence + PID liveness is enough - // and avoids any chance of disturbing a healthy shellper. - const pidAlive = session.shellper_pid !== null && isProcessAlive(session.shellper_pid); - const socketPresent = session.shellper_socket !== null && socketExists(session.shellper_socket); - if (pidAlive || socketPresent) { + // If ANY matching session looks alive, treat the builder as alive. Duplicates + // can occur when a prior recovery left a dead row behind (terminal_sessions + // has no UNIQUE constraint on role_id) — the cautious read is "alive." + if (sessions.some(s => isSessionAlive(s, isProcessAlive, socketExists))) { return { eligible: false, reason: 'shellper_alive' }; } - if (!worktreeExists) { return { eligible: false, reason: 'worktree_missing' }; } @@ -98,11 +127,14 @@ export interface BuilderInfo { * Derive the inputs needed to invoke `afx spawn --resume --protocol ` * and the SQLite `role_id` to look up the builder's terminal session. * - * Normalizes the legacy `spider` protocol alias to `spir`. + * Returns null for protocols that cannot be cleanly resumed via the spawn CLI + * (experiment/maintain/task/legacy spider/etc) — callers should skip those + * projects with an `unsupported_protocol` reason. */ -export function deriveBuilderInfo(state: ProjectState): BuilderInfo { - const rawProtocol = state.protocol === 'spider' ? SPIDER_TO_SPIR : state.protocol; - +export function deriveBuilderInfo(state: ProjectState): BuilderInfo | null { + if (!REVIVABLE_PROTOCOLS.has(state.protocol)) { + return null; + } if (state.protocol === 'bugfix') { const numericId = state.id.replace(/^bugfix-/, ''); return { @@ -112,18 +144,23 @@ export function deriveBuilderInfo(state: ProjectState): BuilderInfo { }; } return { - builderId: buildAgentName('spec', state.id, rawProtocol), + builderId: buildAgentName('spec', state.id, state.protocol), issueArg: stripLeadingZeros(state.id), - cliProtocol: rawProtocol, + cliProtocol: state.protocol, }; } /** * Resolve the builder's worktree path on disk, handling both the Spec-653 * ID-only layout and the legacy title-suffixed form. + * + * Returns null when the project's protocol isn't revivable (no defined worktree + * naming) or when no matching directory exists. */ export function resolveWorktreePath(buildersDir: string, state: ProjectState): string | null { const info = deriveBuilderInfo(state); + if (info === null) return null; + const idOnlyName = `${info.cliProtocol}-${info.issueArg}`; const idOnlyPath = join(buildersDir, idOnlyName); if (existsSync(idOnlyPath) && existsSync(join(idOnlyPath, '.git'))) { @@ -154,6 +191,7 @@ export function formatRelativeAge(iso: string): string { function reasonLabel(reason: IneligibleReason): string { switch (reason) { case 'terminal': return 'terminal'; + case 'unsupported_protocol': return 'unsupported protocol'; case 'no_session_row': return 'no session row'; case 'shellper_alive': return 'shellper alive'; case 'worktree_missing': return 'worktree missing'; @@ -163,7 +201,7 @@ function reasonLabel(reason: IneligibleReason): string { interface RecoverRow { state: ProjectState; - builderInfo: BuilderInfo; + builderInfo: BuilderInfo | null; worktreePath: string | null; eligibility: EligibilityResult; } @@ -192,10 +230,13 @@ function printPreview(rows: RecoverRow[]): void { } async function respawnBuilder(info: BuilderInfo): Promise { + // Use the current node binary and CLI entry point so the respawn invocation + // matches the install method this command was started under (npm global, + // npx, dev script, etc.) instead of relying on PATH lookup of 'afx'. await new Promise((resolvePromise, rejectPromise) => { const child = spawn( - 'afx', - ['spawn', info.issueArg, '--resume', '--protocol', info.cliProtocol], + process.execPath, + [process.argv[1], 'spawn', info.issueArg, '--resume', '--protocol', info.cliProtocol], { stdio: 'inherit' }, ); child.on('error', rejectPromise); @@ -217,7 +258,11 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P if (!includeStale) logger.kv('Max age', `${maxAgeDays} day(s)`); logger.blank(); - const projects = listAllProjects(config.workspaceRoot); + const projects = listAllProjects(config.workspaceRoot, { + onParseError: (statusPath, err) => { + logger.debug(`Skipped unparseable ${statusPath}: ${err instanceof Error ? err.message : String(err)}`); + }, + }); if (projects.length === 0) { logger.info('No porch projects found.'); return; @@ -229,18 +274,24 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P } finally { closeGlobalDb(); } - const sessionByRoleId = new Map(); + // role_id has no UNIQUE constraint in the schema, so collect every matching + // row per builder id rather than collapsing to last-write-wins. + const sessionsByRoleId = new Map(); for (const s of sessions) { - if (s.type === 'builder' && s.role_id) sessionByRoleId.set(s.role_id, s); + if (s.type !== 'builder' || !s.role_id) continue; + const bucket = sessionsByRoleId.get(s.role_id); + if (bucket) bucket.push(s); + else sessionsByRoleId.set(s.role_id, [s]); } const rows: RecoverRow[] = projects.map(({ state }) => { const builderInfo = deriveBuilderInfo(state); - const session = sessionByRoleId.get(builderInfo.builderId) ?? null; + const matchingSessions = builderInfo ? sessionsByRoleId.get(builderInfo.builderId) ?? [] : []; const worktreePath = resolveWorktreePath(config.buildersDir, state); const ageDays = (Date.now() - Date.parse(state.updated_at)) / 86_400_000; const eligibility = evaluateEligibility({ - state, session, + state, builderInfo, + sessions: matchingSessions, worktreeExists: worktreePath !== null, ageDays, maxAgeDays, includeStale, isProcessAlive: processExists, @@ -251,7 +302,10 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P printPreview(rows); - const eligible = rows.filter((r): r is RecoverRow & { eligibility: { eligible: true } } => r.eligibility.eligible); + const eligible = rows.filter( + (r): r is RecoverRow & { builderInfo: BuilderInfo; eligibility: { eligible: true } } => + r.eligibility.eligible && r.builderInfo !== null, + ); logger.blank(); logger.kv('Eligible', `${eligible.length} / ${rows.length}`); diff --git a/packages/codev/src/commands/porch/state.ts b/packages/codev/src/commands/porch/state.ts index 82483165b..4408eef78 100644 --- a/packages/codev/src/commands/porch/state.ts +++ b/packages/codev/src/commands/porch/state.ts @@ -310,11 +310,12 @@ export function findStatusPath(workspaceRoot: string, projectId: string): string * multi-PR flows (Spec 653) early phases merge status.yaml to main, so the * worktree copy is the fresher one. * - * status.yaml files that fail to parse are skipped silently — callers that - * care about parse failures can re-attempt readState() on individual paths. + * status.yaml files that fail to parse are skipped. Callers can observe these + * skips by passing `onParseError`; otherwise failures are silent. */ export function listAllProjects( workspaceRoot: string, + opts?: { onParseError?: (statusPath: string, err: unknown) => void }, ): Array<{ statusPath: string; state: ProjectState }> { const collected = new Map(); @@ -330,8 +331,8 @@ export function listAllProjects( if (!collected.has(state.id)) { collected.set(state.id, { statusPath, state }); } - } catch { - // Unparseable / missing-fields — skip silently + } catch (err) { + opts?.onParseError?.(statusPath, err); } } }; From c14fd0d16c5adc14a2a3d5e1cc63566bc61a7410 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 08:43:21 +1000 Subject: [PATCH 04/11] =?UTF-8?q?[Issue=20#829]=20fix:=20invert=20session-?= =?UTF-8?q?row=20predicate=20=E2=80=94=20absence=20means=20revive?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world test against a 345-project workspace surfaced the core flaw: every PIR builder at plan/dev-approval that was killed by a reboot showed 'no_session_row' and got skipped — exactly the case this command exists for. Cause: Tower's reconciliation (tower-terminals.ts:485-711) runs on Tower startup and DELETES terminal_sessions rows whose shellper can't be reconnected. The user must start Tower (`afx workspace start`) before they can spawn anything, so reconciliation always runs before recovery. By the time `workspace recover` queries the DB, every dead builder's row has already been pruned. The previous predicate treated 'no row' as 'wasn't expected to be running', which inverts reality. Fix: drop 'no_session_row' as a skip reason entirely. The session row is now used only as a positive 'still alive' signal — its absence is the common case for any builder that needs revival. Predicate order also reshuffled so the stale check fires at the right point. Previously, anything missing a session row short-circuited before stale was evaluated, so 73-day-old projects showed 'no_session_row' instead of 'stale' (same skip outcome, misleading reason). New order: 1. terminal phase 2. unsupported protocol 3. worktree missing 4. shellper alive (any matching row) 5. stale (age > maxAge unless --include-stale) 6. otherwise → revive Tests: 43 (up from 41). Added regressions for the PIR-at-gate case and for stale + no-row labelling correctly as 'stale'. --- .../__tests__/workspace-recover.test.ts | 37 ++++++++++++++++++- .../agent-farm/commands/workspace-recover.ts | 31 ++++++++++------ 2 files changed, 55 insertions(+), 13 deletions(-) diff --git a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts index ed4b82667..26e94df20 100644 --- a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts +++ b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts @@ -107,7 +107,10 @@ describe('evaluateEligibility', () => { expect(result).toEqual({ eligible: false, reason: 'unsupported_protocol' }); }); - it('skips when no terminal_sessions row exists', () => { + it('revives when no session row exists (Tower already reconciled the dead row)', () => { + // The common post-reboot case: Tower startup runs reconciliation, fails + // to reconnect to the dead shellper, and deletes the row. By the time + // `workspace recover` runs, the row is gone. Absence means "needs revival." const result = evaluateEligibility({ state: makeState(), builderInfo: makeBuilderInfo(), @@ -116,7 +119,7 @@ describe('evaluateEligibility', () => { ageDays: 0, ...defaults(), }); - expect(result).toEqual({ eligible: false, reason: 'no_session_row' }); + expect(result).toEqual({ eligible: true }); }); describe('liveness probe (PID-first per Gemini #829 review)', () => { @@ -274,6 +277,36 @@ describe('evaluateEligibility', () => { }); expect(result).toEqual({ eligible: false, reason: 'terminal' }); }); + + it('skips a stale project even with no session row (post-reconciliation + old)', () => { + // Without the predicate fix, this would have shown `no_session_row`; + // with the new ordering, an old project past the recency window is + // surfaced as `stale` — the more useful diagnostic for the operator. + const result = evaluateEligibility({ + state: makeState(), + builderInfo: makeBuilderInfo(), + sessions: [], + worktreeExists: true, + ageDays: 30, + ...defaults(), + }); + expect(result).toEqual({ eligible: false, reason: 'stale' }); + }); + + it('revives a recent active project even with no session row (the PIR-at-gate case)', () => { + // The motivating bug: a PIR builder sitting at plan-approval, killed by + // a reboot, with its session row cleaned up by Tower's reconciliation. + // Before the predicate fix, this incorrectly skipped with `no_session_row`. + const result = evaluateEligibility({ + state: makeState({ protocol: 'pir', phase: 'plan' }), + builderInfo: makeBuilderInfo({ builderId: 'builder-pir-1661', issueArg: '1661', cliProtocol: 'pir' }), + sessions: [], + worktreeExists: true, + ageDays: 1, + ...defaults(), + }); + expect(result).toEqual({ eligible: true }); + }); }); describe('deriveBuilderInfo', () => { diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts index 348eda02a..dfc8b9af9 100644 --- a/packages/codev/src/agent-farm/commands/workspace-recover.ts +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -37,9 +37,8 @@ export interface WorkspaceRecoverOptions { export type IneligibleReason = | 'terminal' | 'unsupported_protocol' - | 'no_session_row' - | 'shellper_alive' | 'worktree_missing' + | 'shellper_alive' | 'stale'; export type EligibilityResult = @@ -84,8 +83,22 @@ function isSessionAlive( * caller and are passed in via `isProcessAlive` and `socketExists`. This keeps * the predicate trivially unit-testable. * - * Order matters: cheap structural checks come first; filesystem-touching - * checks (worktree) later. + * On terminal_sessions and the "no row" case: + * Tower's reconciliation deletes terminal_sessions rows whose shellper can't + * be reconnected (tower-terminals.ts:485-711). That runs on Tower startup, + * which is exactly what happens between a machine reboot and the user + * invoking `workspace recover`. By the time recovery runs, the rows for + * dead builders have already been pruned. So "no matching session row" + * is the COMMON case for any builder that needs revival — not a reason to + * skip. The row is only useful here as a positive "still alive" signal. + * + * Predicate order (cheap structural checks first): + * 1. terminal phase + * 2. unsupported protocol family + * 3. worktree missing on disk + * 4. any matching session row is alive → known still running, leave alone + * 5. stale (older than maxAge, unless includeStale) + * 6. otherwise → revive */ export function evaluateEligibility(inputs: EligibilityInputs): EligibilityResult { const { @@ -99,8 +112,8 @@ export function evaluateEligibility(inputs: EligibilityInputs): EligibilityResul if (builderInfo === null) { return { eligible: false, reason: 'unsupported_protocol' }; } - if (sessions.length === 0) { - return { eligible: false, reason: 'no_session_row' }; + if (!worktreeExists) { + return { eligible: false, reason: 'worktree_missing' }; } // If ANY matching session looks alive, treat the builder as alive. Duplicates // can occur when a prior recovery left a dead row behind (terminal_sessions @@ -108,9 +121,6 @@ export function evaluateEligibility(inputs: EligibilityInputs): EligibilityResul if (sessions.some(s => isSessionAlive(s, isProcessAlive, socketExists))) { return { eligible: false, reason: 'shellper_alive' }; } - if (!worktreeExists) { - return { eligible: false, reason: 'worktree_missing' }; - } if (!includeStale && ageDays > maxAgeDays) { return { eligible: false, reason: 'stale' }; } @@ -192,9 +202,8 @@ function reasonLabel(reason: IneligibleReason): string { switch (reason) { case 'terminal': return 'terminal'; case 'unsupported_protocol': return 'unsupported protocol'; - case 'no_session_row': return 'no session row'; - case 'shellper_alive': return 'shellper alive'; case 'worktree_missing': return 'worktree missing'; + case 'shellper_alive': return 'shellper alive'; case 'stale': return 'stale'; } } From 386353d4f45bcc46bed4ca448b167a0186c84f40 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 08:56:09 +1000 Subject: [PATCH 05/11] [Issue #829] feat(recover): hide stale items from dry-run by default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Real-world test against a 345-project workspace showed the dry-run table was dominated by 90-day-old stale rows that the operator had no intention of recovering anyway. The interesting rows (recent in-flight builders) got buried in noise. By default the preview now filters to projects within --max-age. A one-line footer reports the hidden count and the flag to see them. Pass --include-stale to dump the full set. The eligible count is still computed against ALL rows, so --max-age still gates revival as before — only the display changes. --- .../agent-farm/commands/workspace-recover.ts | 23 +++++++++++++++---- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts index dfc8b9af9..ef71b786f 100644 --- a/packages/codev/src/agent-farm/commands/workspace-recover.ts +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -213,6 +213,7 @@ interface RecoverRow { builderInfo: BuilderInfo | null; worktreePath: string | null; eligibility: EligibilityResult; + ageDays: number; } function printPreview(rows: RecoverRow[]): void { @@ -293,7 +294,7 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P else sessionsByRoleId.set(s.role_id, [s]); } - const rows: RecoverRow[] = projects.map(({ state }) => { + const allRows: RecoverRow[] = projects.map(({ state }) => { const builderInfo = deriveBuilderInfo(state); const matchingSessions = builderInfo ? sessionsByRoleId.get(builderInfo.builderId) ?? [] : []; const worktreePath = resolveWorktreePath(config.buildersDir, state); @@ -306,17 +307,29 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P isProcessAlive: processExists, socketExists: existsSync, }); - return { state, builderInfo, worktreePath, eligibility }; + return { state, builderInfo, worktreePath, eligibility, ageDays }; }); - printPreview(rows); + // By default the preview hides projects beyond the recency window — for a + // large workspace the stale tail dominates the table and obscures the few + // rows the operator actually cares about. --include-stale shows everything. + const visibleRows = includeStale + ? allRows + : allRows.filter(r => r.ageDays <= maxAgeDays); + const hiddenStaleCount = allRows.length - visibleRows.length; + + printPreview(visibleRows); + if (hiddenStaleCount > 0) { + logger.blank(); + logger.info(`${hiddenStaleCount} project(s) older than ${maxAgeDays} day(s) hidden. Pass --include-stale to show them.`); + } - const eligible = rows.filter( + const eligible = allRows.filter( (r): r is RecoverRow & { builderInfo: BuilderInfo; eligibility: { eligible: true } } => r.eligibility.eligible && r.builderInfo !== null, ); logger.blank(); - logger.kv('Eligible', `${eligible.length} / ${rows.length}`); + logger.kv('Eligible', `${eligible.length} / ${allRows.length}`); if (eligible.length === 0) { logger.info(apply ? 'Nothing to revive.' : 'Nothing would be revived.'); From c6c0dc988f7d102652d795e0313b055349e836f3 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 09:06:57 +1000 Subject: [PATCH 06/11] [Issue #829] feat(recover): conversation resume via on-disk jsonl discovery MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When a builder is respawned with --resume, the revived agent now picks up the prior Claude conversation instead of starting fresh with a generic resume notice. Mechanism: Claude Code automatically persists every session to ~/.claude/projects//.jsonl, where the encoding replaces / and . with -. On --resume, spawn.ts looks up that directory for the worktree, picks the most-recently-modified jsonl, and passes the UUID through to startBuilderSession. The launch script then runs `claude --resume ` (no role injection, no prompt — both are already part of the saved conversation). No spawn-time bookkeeping is required: Claude already writes the jsonl as a side effect of the original session, so this works retroactively for builders spawned before this code shipped. If no jsonl exists (truly fresh worktree, manually cleaned history, etc.), spawn falls back to the existing behavior — fresh session with a resume-notice prompt. Trade-off: when multiple jsonl files exist in the same encoded-cwd directory (e.g. a worktree was used for unrelated work), the newest wins by mtime. For builder worktrees, which are protocol-scoped and typically host one logical conversation, this is the right answer nearly always. The 'Claude session: … (resuming conversation)' log line tells the operator which UUID was picked so a mismatch is visible. Architects use the same on-disk store; extending this mechanism to `afx architect` is straightforward and tracked separately. Tests: 9 unit tests for the discovery helper covering encoding edge cases, missing directory, no-jsonl directory, multiple-jsonl mtime selection, and ignored non-jsonl entries. --- .../claude-session-discovery.test.ts | 97 +++++++++++++++++++ .../src/agent-farm/commands/spawn-worktree.ts | 37 +++++-- .../codev/src/agent-farm/commands/spawn.ts | 17 ++++ .../utils/claude-session-discovery.ts | 69 +++++++++++++ 4 files changed, 213 insertions(+), 7 deletions(-) create mode 100644 packages/codev/src/agent-farm/__tests__/claude-session-discovery.test.ts create mode 100644 packages/codev/src/agent-farm/utils/claude-session-discovery.ts diff --git a/packages/codev/src/agent-farm/__tests__/claude-session-discovery.test.ts b/packages/codev/src/agent-farm/__tests__/claude-session-discovery.test.ts new file mode 100644 index 000000000..db17d8fdc --- /dev/null +++ b/packages/codev/src/agent-farm/__tests__/claude-session-discovery.test.ts @@ -0,0 +1,97 @@ +/** + * Tests for Claude session discovery via on-disk jsonl introspection. + * + * Issue #829 — conversation resume. + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, rmSync, mkdirSync, writeFileSync, utimesSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import { join } from 'node:path'; + +import { + encodeClaudeProjectDir, + findLatestSessionId, +} from '../utils/claude-session-discovery.js'; + +describe('encodeClaudeProjectDir', () => { + it('replaces / with -', () => { + expect(encodeClaudeProjectDir('/Users/x/repo')).toBe('-Users-x-repo'); + }); + + it('replaces . with -', () => { + expect(encodeClaudeProjectDir('/Users/x/repo/.builders/pir-1')).toBe( + '-Users-x-repo--builders-pir-1', + ); + }); + + it('leaves dashes in the source path untouched', () => { + expect(encodeClaudeProjectDir('/Users/x/repo/pir-1298')).toBe( + '-Users-x-repo-pir-1298', + ); + }); + + it('handles paths with multiple dots and slashes', () => { + expect(encodeClaudeProjectDir('/a/b.c/.d.e/f')).toBe('-a-b-c--d-e-f'); + }); +}); + +describe('findLatestSessionId', () => { + let fakeHome: string; + let projectsRoot: string; + + beforeEach(() => { + fakeHome = mkdtempSync(join(tmpdir(), 'csd-test-')); + projectsRoot = join(fakeHome, '.claude', 'projects'); + mkdirSync(projectsRoot, { recursive: true }); + }); + + afterEach(() => { + rmSync(fakeHome, { recursive: true, force: true }); + }); + + function writeSession(absPath: string, uuid: string, mtime: number): void { + const dir = join(projectsRoot, encodeClaudeProjectDir(absPath)); + mkdirSync(dir, { recursive: true }); + const file = join(dir, `${uuid}.jsonl`); + writeFileSync(file, `{"sessionId":"${uuid}"}\n`, 'utf-8'); + const t = mtime / 1000; + utimesSync(file, t, t); + } + + it('returns the newest session UUID by mtime', () => { + const worktree = '/Users/x/repo/.builders/pir-1'; + writeSession(worktree, 'old-uuid', 1_000_000_000_000); + writeSession(worktree, 'newest-uuid', 1_700_000_000_000); + writeSession(worktree, 'middle-uuid', 1_400_000_000_000); + expect(findLatestSessionId(worktree, { homeDir: fakeHome })).toBe('newest-uuid'); + }); + + it('returns null when the project dir does not exist', () => { + expect(findLatestSessionId('/nonexistent/path', { homeDir: fakeHome })).toBeNull(); + }); + + it('returns null when the project dir exists but contains no jsonl files', () => { + const worktree = '/Users/x/repo/.builders/pir-2'; + const dir = join(projectsRoot, encodeClaudeProjectDir(worktree)); + mkdirSync(dir, { recursive: true }); + writeFileSync(join(dir, 'memory'), 'not a jsonl', 'utf-8'); + expect(findLatestSessionId(worktree, { homeDir: fakeHome })).toBeNull(); + }); + + it('ignores non-jsonl files and subdirectories', () => { + const worktree = '/Users/x/repo/.builders/pir-3'; + const dir = join(projectsRoot, encodeClaudeProjectDir(worktree)); + mkdirSync(dir, { recursive: true }); + mkdirSync(join(dir, 'some-uuid'), { recursive: true }); + writeFileSync(join(dir, 'history.txt'), 'text', 'utf-8'); + writeSession(worktree, 'the-uuid', 1_500_000_000_000); + expect(findLatestSessionId(worktree, { homeDir: fakeHome })).toBe('the-uuid'); + }); + + it('returns the single jsonl when only one exists', () => { + const worktree = '/Users/x/repo/.builders/pir-4'; + writeSession(worktree, 'only-uuid', 1_500_000_000_000); + expect(findLatestSessionId(worktree, { homeDir: fakeHome })).toBe('only-uuid'); + }); +}); diff --git a/packages/codev/src/agent-farm/commands/spawn-worktree.ts b/packages/codev/src/agent-farm/commands/spawn-worktree.ts index d7a6d96c7..1203b44a5 100644 --- a/packages/codev/src/agent-farm/commands/spawn-worktree.ts +++ b/packages/codev/src/agent-farm/commands/spawn-worktree.ts @@ -673,7 +673,13 @@ function writeWorktreeFiles( } /** - * Start a terminal session for a builder + * Start a terminal session for a builder. + * + * When `resumeSessionId` is provided, the launch script invokes + * `claude --resume ` instead of a fresh prompt+role invocation. The + * saved Claude conversation contains the system prompt / role context + * already, so role injection and the initial prompt are intentionally + * skipped on that path. */ export async function startBuilderSession( config: Config, @@ -683,18 +689,32 @@ export async function startBuilderSession( prompt: string, roleContent: string | null, roleSource: string | null, + resumeSessionId?: string, ): Promise<{ terminalId: string }> { logger.info('Creating terminal session...'); - // Write initial prompt to a file for reference - const promptFile = resolve(worktreePath, '.builder-prompt.txt'); - writeFileSync(promptFile, prompt); - - // Build the start script with role if provided const scriptPath = resolve(worktreePath, '.builder-start.sh'); let scriptContent: string; - if (roleContent) { + if (resumeSessionId) { + // Resume path: load the prior Claude conversation by UUID. No prompt file, + // no role injection — both are already part of the saved conversation. + logger.info(`Resuming Claude session ${resumeSessionId.slice(0, 8)}…`); + scriptContent = `#!/bin/bash +cd "${worktreePath}" +while true; do + ${baseCmd} --resume "${resumeSessionId}" + echo "" + echo "Agent exited. Restarting in 2 seconds... (Ctrl+C to quit)" + sleep 2 +done +`; + } else if (roleContent) { + // Fresh spawn with role injection. + // Write initial prompt to a file for reference. + const promptFile = resolve(worktreePath, '.builder-prompt.txt'); + writeFileSync(promptFile, prompt); + // Write role to a file for harness-based injection const roleFile = resolve(worktreePath, '.builder-role.md'); // Inject the actual dashboard port into the role prompt @@ -725,6 +745,9 @@ ${envBlock}while true; do done `; } else { + // Fresh spawn without role injection. + const promptFile = resolve(worktreePath, '.builder-prompt.txt'); + writeFileSync(promptFile, prompt); scriptContent = `#!/bin/bash cd "${worktreePath}" while true; do diff --git a/packages/codev/src/agent-farm/commands/spawn.ts b/packages/codev/src/agent-farm/commands/spawn.ts index 0f5ed39eb..874e1a2da 100644 --- a/packages/codev/src/agent-farm/commands/spawn.ts +++ b/packages/codev/src/agent-farm/commands/spawn.ts @@ -37,6 +37,7 @@ const SPAWNING_ARCHITECT_NAME = (process.env.CODEV_ARCHITECT_NAME && process.env.CODEV_ARCHITECT_NAME.trim()) || DEFAULT_ARCHITECT_NAME; import { loadRolePrompt } from '../utils/roles.js'; import { buildAgentName, stripLeadingZeros } from '../utils/agent-names.js'; +import { findLatestSessionId } from '../utils/claude-session-discovery.js'; import { fetchIssue as fetchIssueNonFatal } from '../../lib/github.js'; import { type TemplateContext, @@ -440,6 +441,21 @@ async function spawnSpec(options: SpawnOptions, config: Config): Promise { templateContext.existing_branch = options.branch; } + // On --resume, look for a prior Claude conversation in + // ~/.claude/projects//. If found, the revived builder + // continues that conversation via `claude --resume ` instead of + // starting fresh with a resume-notice prompt. (Issue #829.) + let resumeSessionId: string | undefined; + if (options.resume) { + const found = findLatestSessionId(worktreePath); + if (found) { + logger.kv('Claude session', `${found.slice(0, 8)}… (resuming conversation)`); + resumeSessionId = found; + } else { + logger.info('No prior Claude conversation found for this worktree; starting a fresh session.'); + } + } + const initialPrompt = buildPromptFromTemplate(config, protocol, templateContext); const resumeNotice = options.resume ? `\n${buildResumeNotice(projectId)}\n` : ''; const branchNotice = options.branch @@ -452,6 +468,7 @@ async function spawnSpec(options: SpawnOptions, config: Config): Promise { const { terminalId } = await startBuilderSession( config, builderId, worktreePath, commands.builder, builderPrompt, role?.content ?? null, role?.source ?? null, + resumeSessionId, ); upsertBuilder({ diff --git a/packages/codev/src/agent-farm/utils/claude-session-discovery.ts b/packages/codev/src/agent-farm/utils/claude-session-discovery.ts new file mode 100644 index 000000000..73d618029 --- /dev/null +++ b/packages/codev/src/agent-farm/utils/claude-session-discovery.ts @@ -0,0 +1,69 @@ +// Discover the most-recent Claude conversation session for a given working +// directory by inspecting Claude Code's on-disk session store. +// +// Claude Code automatically persists every interactive session to +// ~/.claude/projects//.jsonl +// where the encoding replaces both '/' and '.' in the absolute path with '-'. +// We use the newest jsonl by mtime as a stand-in for "the last conversation +// that ran in this directory" so reviving a dead builder (or architect) can +// resume via `claude --resume ` without any spawn-time bookkeeping. +// +// This is intentionally a heuristic — multiple jsonl files in the same +// directory mean multiple past sessions, and we pick the most recent. For +// builder worktrees that almost always means the right one; for shared cwds +// the caller should be aware of the ambiguity. + +import { existsSync, readdirSync, statSync } from 'node:fs'; +import { homedir } from 'node:os'; +import { join } from 'node:path'; + +const JSONL_EXT = '.jsonl'; + +/** + * Encode an absolute path to the directory name Claude uses under + * ~/.claude/projects/. The scheme is: replace every '/' and '.' with '-'. + * + * Example: '/Users/x/repos/foo/.builders/pir-1' → '-Users-x-repos-foo--builders-pir-1' + */ +export function encodeClaudeProjectDir(absolutePath: string): string { + return absolutePath.replace(/[/.]/g, '-'); +} + +export function getClaudeProjectDir(absolutePath: string): string { + return join(homedir(), '.claude', 'projects', encodeClaudeProjectDir(absolutePath)); +} + +/** + * Return the session UUID of the most-recently-modified jsonl in the Claude + * project dir for the given cwd, or null if none exists. + * + * Optionally accepts `now` and a `homeDir` override so tests can pin both. + */ +export function findLatestSessionId( + absolutePath: string, + opts?: { homeDir?: string }, +): string | null { + const home = opts?.homeDir ?? homedir(); + const dir = join(home, '.claude', 'projects', encodeClaudeProjectDir(absolutePath)); + if (!existsSync(dir)) return null; + + let bestName: string | null = null; + let bestMtime = -Infinity; + + for (const entry of readdirSync(dir, { withFileTypes: true })) { + if (!entry.isFile() || !entry.name.endsWith(JSONL_EXT)) continue; + const fullPath = join(dir, entry.name); + try { + const mtime = statSync(fullPath).mtimeMs; + if (mtime > bestMtime) { + bestMtime = mtime; + bestName = entry.name; + } + } catch { + // stat failed (race with deletion, permissions) — skip + } + } + + if (!bestName) return null; + return bestName.slice(0, -JSONL_EXT.length); +} From b96dcacc6cf0f3fac6e0722a2cd1b41e55a524dd Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 09:10:44 +1000 Subject: [PATCH 07/11] [Issue #829] fix(recover): round age display UP so labels match --max-age MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit formatRelativeAge used Math.floor on the day bucket, so a project that was 25 hours old printed as '1d ago' — but `--max-age 1` (strict `ageDays > 1`) excluded it because its actual age was 1.04 days. Operators reasonably expected '1d ago' rows to fall under '--max-age 1'. Switch to Math.ceil, computed from raw ms rather than the floored hour count so sub-hour precision survives. Effect: - exactly 24h → '1d ago' - 24h + 1s → '2d ago' - 47h → '2d ago' - 48h → '2d ago' - 48h + 1s → '3d ago' Now '--max-age N' includes every row labelled 'Nd ago' or less, which is what the label promises. Predicate semantics unchanged. --- .../__tests__/workspace-recover.test.ts | 17 +++++++++++++++++ .../agent-farm/commands/workspace-recover.ts | 7 ++++++- 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts index 26e94df20..d392db73c 100644 --- a/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts +++ b/packages/codev/src/agent-farm/__tests__/workspace-recover.test.ts @@ -512,6 +512,23 @@ describe('formatRelativeAge', () => { expect(formatRelativeAge(iso)).toMatch(/^\d+d ago$/); }); + it('rounds days UP so the label aligns with --max-age (25h shows 2d, not 1d)', () => { + const iso = new Date(Date.now() - 25 * 3600_000).toISOString(); + expect(formatRelativeAge(iso)).toBe('2d ago'); + }); + + it('rounds 47h up to 2d (still within the ceil(2) bucket)', () => { + const iso = new Date(Date.now() - 47 * 3600_000).toISOString(); + expect(formatRelativeAge(iso)).toBe('2d ago'); + }); + + it('shows "2d ago" rather than "1d ago" for anything strictly older than 24h', () => { + // Just past the boundary — ensures the predicate boundary + // (`ageDays > maxAge`) matches what the label promises. + const iso = new Date(Date.now() - (24 * 3600_000 + 1_000)).toISOString(); + expect(formatRelativeAge(iso)).toBe('2d ago'); + }); + it('returns placeholder for malformed ISO', () => { expect(formatRelativeAge('not a date')).toBe('—'); }); diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts index ef71b786f..a6b3f5490 100644 --- a/packages/codev/src/agent-farm/commands/workspace-recover.ts +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -194,7 +194,12 @@ export function formatRelativeAge(iso: string): string { if (minutes < 60) return `${minutes}m ago`; const hours = Math.floor(minutes / 60); if (hours < 24) return `${hours}h ago`; - const days = Math.floor(hours / 24); + // Round UP rather than down so the label aligns with --max-age semantics: + // anything strictly older than 24h prints as "2d ago" (not "1d ago"), so + // a row labelled "1d ago" is actually within --max-age 1 (≤ 24h exact). + // Use ms rather than the floored `hours` to preserve sub-hour precision — + // 24h + 1s must ceil to 2 days, not 1. + const days = Math.ceil(ms / 86_400_000); return `${days}d ago`; } From b37c6eb2c6f410c7173fe29205745f7e9959794e Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 09:27:22 +1000 Subject: [PATCH 08/11] [Issue #830] feat(tower): resume main architect's Claude conversation after reboot MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When the main architect is spawned by Tower's launchInstance — the path that fires when 'afx workspace start' activates a workspace and 'main' isn't already alive — look up the most-recent jsonl session for the workspace cwd via findLatestSessionId(). If one exists, pass '--resume ' instead of the role-injection args, so the revived architect picks up where it left off after a reboot. Reuses the on-disk Claude session discovery from #831 (builder conversation resume), but at the Tower spawn point rather than the spawn.ts CLI path because architects are server-side spawned. Scope: main architect only. Named sibling architects added via 'afx workspace add-architect' (Spec 755) share the same workspace cwd, so the newest-jsonl heuristic can't disambiguate them. addArchitect() deliberately does NOT use this resume path — siblings continue to get fresh sessions on revival. A per-architect disambiguation design is tracked in #830. --- .../src/agent-farm/servers/tower-instances.ts | 29 ++++++++++++++++++- 1 file changed, 28 insertions(+), 1 deletion(-) diff --git a/packages/codev/src/agent-farm/servers/tower-instances.ts b/packages/codev/src/agent-farm/servers/tower-instances.ts index fc87cc5e5..163832180 100644 --- a/packages/codev/src/agent-farm/servers/tower-instances.ts +++ b/packages/codev/src/agent-farm/servers/tower-instances.ts @@ -13,6 +13,7 @@ import { exec } from 'node:child_process'; import { promisify } from 'node:util'; import { homedir } from 'node:os'; import { encodeWorkspacePath } from '../lib/tower-client.js'; +import { findLatestSessionId } from '../utils/claude-session-discovery.js'; import { loadConfig } from '../../lib/config.js'; const execAsync = promisify(exec); @@ -456,7 +457,33 @@ export async function launchInstance(workspacePath: string): Promise<{ success: // Parse command string to separate command and args, inject role prompt const cmdParts = architectCmd.split(/\s+/); const cmd = cmdParts[0]; - const { args: cmdArgs, env: harnessEnv } = buildArchitectArgs(cmdParts.slice(1), workspacePath); + + // Issue #830 (main architect only): if a prior Claude session exists + // for this workspace cwd, resume it instead of starting fresh. Role + // injection is skipped on the resume path — the saved conversation + // already contains the role/system prompt. + // Named sibling architects (Spec 755) sharing the same cwd would + // collide on the newest-jsonl heuristic; addArchitect deliberately + // does NOT use this resume path. + // + // Lookup is unconditional here (unlike builders, where spawn.ts gates + // discovery behind `options.resume`). The asymmetry is intentional: + // launchInstance only runs when main isn't already alive, so the + // implicit intent is always "spawn the missing main; resume its + // prior conversation if one exists." There is no equivalent user + // intent surface (no flag) on the workspace-start path. + const resumeSessionId = findLatestSessionId(workspacePath); + let cmdArgs: string[]; + let harnessEnv: Record; + if (resumeSessionId) { + cmdArgs = [...cmdParts.slice(1), '--resume', resumeSessionId]; + harnessEnv = {}; + _deps.log('INFO', `Resuming main architect Claude session ${resumeSessionId.slice(0, 8)}… for ${workspacePath}`); + } else { + const built = buildArchitectArgs(cmdParts.slice(1), workspacePath); + cmdArgs = built.args; + harnessEnv = built.env; + } // Build env with CLAUDECODE removed so spawned Claude processes // don't detect a nested session, and merge harness env vars. From 79eb2963d3c2ddf37dc3a104fa2936ab0f67095c Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 09:48:48 +1000 Subject: [PATCH 09/11] [Issue #829] fix: address cmap-3 round-2 review findings MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two real gaps + two nits from the round-2 review (Gemini + Codex + Claude): 1. Conversation resume parity for spawnIssueDrivenBuilder (Claude). spawnSpec had findLatestSessionId discovery wired in but the sister function that handles bugfix and PIR (spawnIssueDrivenBuilder) was never updated. 'afx workspace recover' against shannon — dominated by PIR builders — was correctly re-spawning each builder process but skipping conversation resume entirely. PIR/bugfix builders came up as fresh claude sessions with no memory of prior work. Extracted the resume-session discovery + logging into a small helper (discoverResumeSession) and called it from both spawn sites. Pure utility findLatestSessionId stays unchanged and importable; the wrapper owns the spawn-CLI gate + log concerns. 2. Multi-architect collision guard for main architect resume (Codex + Gemini). tower-instances.ts launchInstance was using findLatestSessionId unconditionally for main. Named sibling architects (Spec 755) share cwd = workspacePath with main, so all their jsonls land in the same encoded directory. The newest-by-mtime heuristic would attach main to whichever architect was active most recently — possibly a sibling, silently 'stealing' that sibling's conversation context. Conservative fix: before resuming main, check getArchitects() from state.db. If more than one architect is persisted, skip the resume path — main spawns fresh with role injection, and a WARN log surfaces what happened with a reference to #832. Single-architect workspaces (the common case) keep conversation resume. The proper per-architect UUID storage is Waleed's #832. Plus two nits: - JSDoc on findLatestSessionId mentioned a 'now' parameter that doesn't exist (would-have-been-a-test-pin that never landed). Removed. - 'Eligible: X / Y' counter now clarifies when stale rows are hidden: 'Eligible: 3 / 12 visible (345 scanned)' instead of just 'X / 345', which previously looked like every project was being considered. Codex's third blocker (cleanup'd-but-worktree-preserved revival) was deliberately not addressed — Claude and Gemini both read the current behavior as correct. The dry-run preview + --apply confirmation is the safety net, and a preserved worktree IS the signal that the work might be resumable. Tests: 55 still passing. The two real behavior changes are at code sites that aren't easily unit-testable without mocking the entire spawn pipeline / Tower — the changes are small (helper extraction, defensive guard) and the existing predicate tests continue to exercise surrounding logic. --- .../codev/src/agent-farm/commands/spawn.ts | 35 +++++++++++-------- .../agent-farm/commands/workspace-recover.ts | 6 +++- .../src/agent-farm/servers/tower-instances.ts | 30 +++++++++++++--- .../utils/claude-session-discovery.ts | 3 +- 4 files changed, 54 insertions(+), 20 deletions(-) diff --git a/packages/codev/src/agent-farm/commands/spawn.ts b/packages/codev/src/agent-farm/commands/spawn.ts index 874e1a2da..35672259e 100644 --- a/packages/codev/src/agent-farm/commands/spawn.ts +++ b/packages/codev/src/agent-farm/commands/spawn.ts @@ -76,6 +76,24 @@ import { executeForgeCommand, loadForgeConfig } from '../../lib/forge.js'; /** * Log spawn success with terminal WebSocket URL */ +/** + * On --resume, look up the prior Claude conversation jsonl for the worktree + * so the revived builder can pick up the saved conversation via + * `claude --resume ` instead of starting fresh with a resume-notice + * prompt. (Issue #831.) Returns undefined when not resuming or when no + * jsonl exists; callers fall back to the fresh-spawn path in that case. + */ +function discoverResumeSession(worktreePath: string, isResume: boolean | undefined): string | undefined { + if (!isResume) return undefined; + const found = findLatestSessionId(worktreePath); + if (found) { + logger.kv('Claude session', `${found.slice(0, 8)}… (resuming conversation)`); + return found; + } + logger.info('No prior Claude conversation found for this worktree; starting a fresh session.'); + return undefined; +} + function logSpawnSuccess(label: string, terminalId: string, mode?: string): void { const client = getTowerClient(); logger.blank(); @@ -441,20 +459,7 @@ async function spawnSpec(options: SpawnOptions, config: Config): Promise { templateContext.existing_branch = options.branch; } - // On --resume, look for a prior Claude conversation in - // ~/.claude/projects//. If found, the revived builder - // continues that conversation via `claude --resume ` instead of - // starting fresh with a resume-notice prompt. (Issue #829.) - let resumeSessionId: string | undefined; - if (options.resume) { - const found = findLatestSessionId(worktreePath); - if (found) { - logger.kv('Claude session', `${found.slice(0, 8)}… (resuming conversation)`); - resumeSessionId = found; - } else { - logger.info('No prior Claude conversation found for this worktree; starting a fresh session.'); - } - } + const resumeSessionId = discoverResumeSession(worktreePath, options.resume); const initialPrompt = buildPromptFromTemplate(config, protocol, templateContext); const resumeNotice = options.resume ? `\n${buildResumeNotice(projectId)}\n` : ''; @@ -833,11 +838,13 @@ async function spawnIssueDrivenBuilder( : ''; const builderPrompt = `You are a Builder. Read codev/roles/builder.md for your full role definition.\n${resumeNotice}${branchNotice}\n${prompt}`; + const resumeSessionId = discoverResumeSession(worktreePath, options.resume); const role = options.noRole ? null : loadRolePrompt(config, 'builder'); const commands = getResolvedCommands(); const { terminalId } = await startBuilderSession( config, builderId, worktreePath, commands.builder, builderPrompt, role?.content ?? null, role?.source ?? null, + resumeSessionId, ); upsertBuilder({ diff --git a/packages/codev/src/agent-farm/commands/workspace-recover.ts b/packages/codev/src/agent-farm/commands/workspace-recover.ts index a6b3f5490..fc50221ef 100644 --- a/packages/codev/src/agent-farm/commands/workspace-recover.ts +++ b/packages/codev/src/agent-farm/commands/workspace-recover.ts @@ -334,7 +334,11 @@ export async function workspaceRecover(options: WorkspaceRecoverOptions = {}): P r.eligibility.eligible && r.builderInfo !== null, ); logger.blank(); - logger.kv('Eligible', `${eligible.length} / ${allRows.length}`); + if (hiddenStaleCount > 0) { + logger.kv('Eligible', `${eligible.length} / ${visibleRows.length} visible (${allRows.length} scanned)`); + } else { + logger.kv('Eligible', `${eligible.length} / ${allRows.length}`); + } if (eligible.length === 0) { logger.info(apply ? 'Nothing to revive.' : 'Nothing would be revived.'); diff --git a/packages/codev/src/agent-farm/servers/tower-instances.ts b/packages/codev/src/agent-farm/servers/tower-instances.ts index 163832180..8c3b8a950 100644 --- a/packages/codev/src/agent-farm/servers/tower-instances.ts +++ b/packages/codev/src/agent-farm/servers/tower-instances.ts @@ -462,9 +462,6 @@ export async function launchInstance(workspacePath: string): Promise<{ success: // for this workspace cwd, resume it instead of starting fresh. Role // injection is skipped on the resume path — the saved conversation // already contains the role/system prompt. - // Named sibling architects (Spec 755) sharing the same cwd would - // collide on the newest-jsonl heuristic; addArchitect deliberately - // does NOT use this resume path. // // Lookup is unconditional here (unlike builders, where spawn.ts gates // discovery behind `options.resume`). The asymmetry is intentional: @@ -472,7 +469,32 @@ export async function launchInstance(workspacePath: string): Promise<{ success: // implicit intent is always "spawn the missing main; resume its // prior conversation if one exists." There is no equivalent user // intent surface (no flag) on the workspace-start path. - const resumeSessionId = findLatestSessionId(workspacePath); + // + // Multi-architect collision guard (Codex / Gemini cmap-3 round 2): + // Named sibling architects (Spec 755) share `cwd = workspacePath` and + // write their jsonls into the same encoded-cwd directory as main. The + // newest-jsonl heuristic can't distinguish which jsonl belongs to + // which architect — so if any sibling has ever existed in this + // workspace, main risks resuming a sibling's conversation instead of + // its own. Until #832 lands per-architect session UUIDs, the + // conservative behavior is: if siblings are persisted, skip resume + // for main and spawn fresh with role injection. Single-architect + // workspaces (the common case) keep conversation resume. + let safeToResume: boolean; + try { + // Single architect (main only, or empty before first spawn) → safe. + // Multiple → unsafe (sibling jsonls collide with main's in the same cwd). + safeToResume = getArchitects().length <= 1; + } catch { + // state.db read should never fail here, but if it does the safe + // default is to skip resume rather than risk attaching main to + // an unrelated jsonl. + safeToResume = false; + } + const resumeSessionId = safeToResume ? findLatestSessionId(workspacePath) : null; + if (!safeToResume) { + _deps.log('WARN', `Skipping main architect conversation resume for ${workspacePath}: sibling architects detected (or state.db unreadable); cannot disambiguate jsonl by cwd. See #832.`); + } let cmdArgs: string[]; let harnessEnv: Record; if (resumeSessionId) { diff --git a/packages/codev/src/agent-farm/utils/claude-session-discovery.ts b/packages/codev/src/agent-farm/utils/claude-session-discovery.ts index 73d618029..b8d505aaf 100644 --- a/packages/codev/src/agent-farm/utils/claude-session-discovery.ts +++ b/packages/codev/src/agent-farm/utils/claude-session-discovery.ts @@ -37,7 +37,8 @@ export function getClaudeProjectDir(absolutePath: string): string { * Return the session UUID of the most-recently-modified jsonl in the Claude * project dir for the given cwd, or null if none exists. * - * Optionally accepts `now` and a `homeDir` override so tests can pin both. + * `opts.homeDir` lets tests pin the home directory; otherwise resolves via + * `os.homedir()`. */ export function findLatestSessionId( absolutePath: string, From 4e4d68f1fffc82383928c1ab606e9daa5999eca8 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 09:54:54 +1000 Subject: [PATCH 10/11] [Issue #829] fix: cmap-3 round-3 nits MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three small follow-ups from round-3 review (all APPROVE / APPROVE_WITH_NITS): - Deleted dangling JSDoc that was orphaned when discoverResumeSession was inserted above logSpawnSuccess (Claude + Codex flagged). - Tightened the multi-architect guard's WARN log and code comment to say 'persisted sibling architects' — that's what we actually check (Codex). Also added a note that the guard only looks at current persisted state, not history, so stale jsonl from removed siblings could still cause wrong-attachment (acceptable until #832). - Added 5 unit tests for discoverResumeSession covering: isResume=false → undefined regardless of jsonls isResume=undefined → undefined isResume=true + no jsonl → undefined isResume=true + multiple jsonls → newest UUID isResume=false short-circuits before any filesystem touch Exports the helper for testability. Tests pin HOME to a temp dir so findLatestSessionId looks at fake jsonls instead of the user's real ~/.claude/projects/. Tests: 60 passing (was 55). The multi-architect guard remains unit-test-light by reviewer consensus — it's a small ternary on an array length deep in tower-instances.ts and would require mocking the whole launchInstance pipeline to isolate. --- .../__tests__/discover-resume-session.test.ts | 92 +++++++++++++++++++ .../codev/src/agent-farm/commands/spawn.ts | 5 +- .../src/agent-farm/servers/tower-instances.ts | 18 ++-- 3 files changed, 104 insertions(+), 11 deletions(-) create mode 100644 packages/codev/src/agent-farm/__tests__/discover-resume-session.test.ts diff --git a/packages/codev/src/agent-farm/__tests__/discover-resume-session.test.ts b/packages/codev/src/agent-farm/__tests__/discover-resume-session.test.ts new file mode 100644 index 000000000..c7c57bcaa --- /dev/null +++ b/packages/codev/src/agent-farm/__tests__/discover-resume-session.test.ts @@ -0,0 +1,92 @@ +/** + * Tests for the discoverResumeSession helper — the spawn-CLI wrapper that + * gates findLatestSessionId on the --resume flag and surfaces a user-facing + * log line. (Issues #829 / #831.) + */ + +import { describe, it, expect, beforeEach, afterEach } from 'vitest'; +import { mkdtempSync, rmSync, mkdirSync, writeFileSync, utimesSync } from 'node:fs'; +import { tmpdir, homedir } from 'node:os'; +import { join } from 'node:path'; + +import { discoverResumeSession } from '../commands/spawn.js'; +import { encodeClaudeProjectDir } from '../utils/claude-session-discovery.js'; + +// discoverResumeSession reads from $HOME via os.homedir() through +// findLatestSessionId. Override the env var for the duration of the test so +// the helper looks at our fake home instead of the user's real one. +function pinHome(fakeHome: string, fn: () => T): T { + const original = process.env.HOME; + process.env.HOME = fakeHome; + try { + return fn(); + } finally { + if (original === undefined) delete process.env.HOME; + else process.env.HOME = original; + } +} + +function writeSession(projectsRoot: string, absPath: string, uuid: string, mtimeMs: number): void { + const dir = join(projectsRoot, encodeClaudeProjectDir(absPath)); + mkdirSync(dir, { recursive: true }); + const file = join(dir, `${uuid}.jsonl`); + writeFileSync(file, `{"sessionId":"${uuid}"}\n`, 'utf-8'); + const t = mtimeMs / 1000; + utimesSync(file, t, t); +} + +describe('discoverResumeSession', () => { + let fakeHome: string; + let projectsRoot: string; + + beforeEach(() => { + fakeHome = mkdtempSync(join(tmpdir(), 'drs-test-')); + projectsRoot = join(fakeHome, '.claude', 'projects'); + mkdirSync(projectsRoot, { recursive: true }); + }); + + afterEach(() => { + rmSync(fakeHome, { recursive: true, force: true }); + }); + + it('returns undefined when isResume is false (no filesystem touch)', () => { + // Even if a jsonl exists, a non-resume spawn must not pick it up. + const worktree = '/Users/x/repo/.builders/spir-1'; + writeSession(projectsRoot, worktree, 'should-not-pick', 1_700_000_000_000); + pinHome(fakeHome, () => { + expect(discoverResumeSession(worktree, false)).toBeUndefined(); + }); + }); + + it('returns undefined when isResume is undefined', () => { + const worktree = '/Users/x/repo/.builders/spir-2'; + writeSession(projectsRoot, worktree, 'should-not-pick', 1_700_000_000_000); + pinHome(fakeHome, () => { + expect(discoverResumeSession(worktree, undefined)).toBeUndefined(); + }); + }); + + it('returns undefined when isResume is true but no jsonl exists', () => { + const worktree = '/Users/x/repo/.builders/spir-3-no-jsonl'; + pinHome(fakeHome, () => { + expect(discoverResumeSession(worktree, true)).toBeUndefined(); + }); + }); + + it('returns the newest jsonl UUID when isResume is true and jsonls exist', () => { + const worktree = '/Users/x/repo/.builders/pir-1661'; + writeSession(projectsRoot, worktree, 'older-uuid', 1_500_000_000_000); + writeSession(projectsRoot, worktree, 'newest-uuid', 1_700_000_000_000); + pinHome(fakeHome, () => { + expect(discoverResumeSession(worktree, true)).toBe('newest-uuid'); + }); + }); + + it('does not consult the filesystem when isResume is false (perf safety)', () => { + // Negative case: isResume=false short-circuits before any filesystem + // access happens. Tests pass even if HOME points at /nonexistent. + pinHome('/nonexistent-home-path', () => { + expect(discoverResumeSession('/some/worktree', false)).toBeUndefined(); + }); + }); +}); diff --git a/packages/codev/src/agent-farm/commands/spawn.ts b/packages/codev/src/agent-farm/commands/spawn.ts index 35672259e..3f1f86957 100644 --- a/packages/codev/src/agent-farm/commands/spawn.ts +++ b/packages/codev/src/agent-farm/commands/spawn.ts @@ -73,9 +73,6 @@ import { executeForgeCommand, loadForgeConfig } from '../../lib/forge.js'; // ID and Session Management // ============================================================================= -/** - * Log spawn success with terminal WebSocket URL - */ /** * On --resume, look up the prior Claude conversation jsonl for the worktree * so the revived builder can pick up the saved conversation via @@ -83,7 +80,7 @@ import { executeForgeCommand, loadForgeConfig } from '../../lib/forge.js'; * prompt. (Issue #831.) Returns undefined when not resuming or when no * jsonl exists; callers fall back to the fresh-spawn path in that case. */ -function discoverResumeSession(worktreePath: string, isResume: boolean | undefined): string | undefined { +export function discoverResumeSession(worktreePath: string, isResume: boolean | undefined): string | undefined { if (!isResume) return undefined; const found = findLatestSessionId(worktreePath); if (found) { diff --git a/packages/codev/src/agent-farm/servers/tower-instances.ts b/packages/codev/src/agent-farm/servers/tower-instances.ts index 8c3b8a950..1ab315b4d 100644 --- a/packages/codev/src/agent-farm/servers/tower-instances.ts +++ b/packages/codev/src/agent-farm/servers/tower-instances.ts @@ -473,13 +473,17 @@ export async function launchInstance(workspacePath: string): Promise<{ success: // Multi-architect collision guard (Codex / Gemini cmap-3 round 2): // Named sibling architects (Spec 755) share `cwd = workspacePath` and // write their jsonls into the same encoded-cwd directory as main. The - // newest-jsonl heuristic can't distinguish which jsonl belongs to - // which architect — so if any sibling has ever existed in this - // workspace, main risks resuming a sibling's conversation instead of - // its own. Until #832 lands per-architect session UUIDs, the - // conservative behavior is: if siblings are persisted, skip resume - // for main and spawn fresh with role injection. Single-architect + // newest-jsonl heuristic can't tell which jsonl belongs to which + // architect — so if any sibling is currently persisted in state.db, + // main risks resuming a sibling's conversation instead of its own. + // Until #832 lands per-architect session UUIDs, the conservative + // behavior is: if state.db reports >1 architect, skip resume for + // main and spawn fresh with role injection. Single-architect // workspaces (the common case) keep conversation resume. + // + // Note: this only checks current persisted state, not history. A + // sibling that existed before but was removed leaves stale jsonl + // files behind; main could still pick one up. Acceptable until #832. let safeToResume: boolean; try { // Single architect (main only, or empty before first spawn) → safe. @@ -493,7 +497,7 @@ export async function launchInstance(workspacePath: string): Promise<{ success: } const resumeSessionId = safeToResume ? findLatestSessionId(workspacePath) : null; if (!safeToResume) { - _deps.log('WARN', `Skipping main architect conversation resume for ${workspacePath}: sibling architects detected (or state.db unreadable); cannot disambiguate jsonl by cwd. See #832.`); + _deps.log('WARN', `Skipping main architect conversation resume for ${workspacePath}: persisted sibling architects detected (or state.db unreadable); cannot disambiguate jsonl by cwd. See #832.`); } let cmdArgs: string[]; let harnessEnv: Record; From 4d112a386e8a7ede454a1ae03a8017133fe69700 Mon Sep 17 00:00:00 2001 From: Amr Elsayed Date: Sun, 24 May 2026 10:06:08 +1000 Subject: [PATCH 11/11] [Issue #829] fix: pass workspace path to getArchitects (post-merge) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit After merging origin/main, Bugfix #826 iter-6 changed getArchitects() to take a workspacePath parameter (workspace-scoped architect table). CI caught the missing argument before this branch was rebuilt. Updated the multi-architect collision guard to pass resolvedPath — the canonical workspace path the architect rows are keyed by. Verified locally: pnpm exec tsc --noEmit clean; 1913 tests pass (was 60 in my own suites; merge brought in the full agent-farm test set including the new bugfix-826-migration tests). --- packages/codev/src/agent-farm/servers/tower-instances.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/codev/src/agent-farm/servers/tower-instances.ts b/packages/codev/src/agent-farm/servers/tower-instances.ts index ca86b8cce..15f43bb62 100644 --- a/packages/codev/src/agent-farm/servers/tower-instances.ts +++ b/packages/codev/src/agent-farm/servers/tower-instances.ts @@ -488,7 +488,9 @@ export async function launchInstance(workspacePath: string): Promise<{ success: try { // Single architect (main only, or empty before first spawn) → safe. // Multiple → unsafe (sibling jsonls collide with main's in the same cwd). - safeToResume = getArchitects().length <= 1; + // Bugfix #826: getArchitects is workspace-scoped — pass resolvedPath + // (the canonical workspace path used by the architect table). + safeToResume = getArchitects(resolvedPath).length <= 1; } catch { // state.db read should never fail here, but if it does the safe // default is to skip resume rather than risk attaching main to