diff --git a/.trajectories/compacted/release-6.2.3.json b/.trajectories/compacted/release-6.2.3.json index 45ce515e2..60d419657 100644 --- a/.trajectories/compacted/release-6.2.3.json +++ b/.trajectories/compacted/release-6.2.3.json @@ -3,11 +3,7 @@ "version": 1, "type": "compacted", "compactedAt": "2026-05-19T14:41:29.877Z", - "sourceTrajectories": [ - "traj_47akjihewlow", - "traj_f9wxa8ujeg78", - "traj_sqerp89tc436" - ], + "sourceTrajectories": ["traj_47akjihewlow", "traj_f9wxa8ujeg78", "traj_sqerp89tc436"], "dateRange": { "start": "2026-05-19T00:45:33.159Z", "end": "2026-05-19T01:38:29.105Z" @@ -15,9 +11,7 @@ "summary": { "totalDecisions": 4, "totalEvents": 4, - "uniqueAgents": [ - "default" - ] + "uniqueAgents": ["default"] }, "decisionGroups": [ { @@ -83,9 +77,5 @@ "crates/broker/src/worker.rs", "crates/broker/src/wrap.rs" ], - "commits": [ - "7182810c", - "2ecfb018", - "040e6d9f" - ] -} \ No newline at end of file + "commits": ["7182810c", "2ecfb018", "040e6d9f"] +} diff --git a/.trajectories/compacted/release-6.2.3.md b/.trajectories/compacted/release-6.2.3.md index a5b37a97c..209568c3b 100644 --- a/.trajectories/compacted/release-6.2.3.md +++ b/.trajectories/compacted/release-6.2.3.md @@ -1,6 +1,7 @@ # Trajectory Compaction: May 19, 2026 - May 19, 2026 ## Summary + - Sessions: 3 - Decisions: 4 - Events: 4 @@ -9,15 +10,19 @@ - Commits: 3 ## Architecture + - Split broker runtime by responsibility -> Split broker runtime by responsibility (traj_47akjihewlow) - Split broker binary entrypoint mechanically -> Split broker binary entrypoint mechanically (traj_f9wxa8ujeg78) ## Other + - Apply issue 875 to crates/broker/src/main.rs -> Apply issue 875 to crates/broker/src/main.rs (traj_sqerp89tc436) - Removed repository-root bin fallback -> Removed repository-root bin fallback (traj_sqerp89tc436) ## Key Learnings + - None ## Key Findings -- None \ No newline at end of file + +- None diff --git a/.trajectories/completed/2026-05/traj_2gpglosdsq7s.json 
b/.trajectories/completed/2026-05/traj_2gpglosdsq7s.json new file mode 100644 index 000000000..5155e6896 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_2gpglosdsq7s.json @@ -0,0 +1,53 @@ +{ + "id": "traj_2gpglosdsq7s", + "version": 1, + "task": { + "title": "Fix broker session read paths and agent listing errors" + }, + "status": "completed", + "startedAt": "2026-05-19T12:37:18.367Z", + "completedAt": "2026-05-19T12:48:50.116Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T12:44:57.363Z" + } + ], + "chapters": [ + { + "id": "chap_fxwoou59eukx", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T12:44:57.363Z", + "endedAt": "2026-05-19T12:48:50.116Z", + "events": [ + { + "ts": 1779194697364, + "type": "decision", + "content": "Resolved CLI read surfaces through the project broker connection file: Resolved CLI read surfaces through the project broker connection file", + "raw": { + "question": "Resolved CLI read surfaces through the project broker connection file", + "chosen": "Resolved CLI read surfaces through the project broker connection file", + "alternatives": [], + "reasoning": "status already reads the project .agent-relay/connection.json; passing that path explicitly prevents AGENT_RELAY_STATE_DIR from redirecting who/agents/history/replies to a stale broker" + }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Fixed broker read surfaces to use the project broker connection and report agent-list query failures instead of empty lists", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/khaliqgant/Projects/AgentWorkforce/relay-worktrees/broker-session-query-and-listagents", + "tags": [], + "_trace": { + "startRef": "f5dd259e2a7009bcdbc1e9aa30c750ac74e5aeca", + "endRef": "f5dd259e2a7009bcdbc1e9aa30c750ac74e5aeca" + } +} diff --git a/.trajectories/completed/2026-05/traj_2gpglosdsq7s.md 
b/.trajectories/completed/2026-05/traj_2gpglosdsq7s.md new file mode 100644 index 000000000..d220cc37c --- /dev/null +++ b/.trajectories/completed/2026-05/traj_2gpglosdsq7s.md @@ -0,0 +1,33 @@ +# Trajectory: Fix broker session read paths and agent listing errors + +> **Status:** ✅ Completed +> **Confidence:** 90% +> **Started:** May 19, 2026 at 02:37 PM +> **Completed:** May 19, 2026 at 02:48 PM + +--- + +## Summary + +Fixed broker read surfaces to use the project broker connection and report agent-list query failures instead of empty lists + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Resolved CLI read surfaces through the project broker connection file + +- **Chose:** Resolved CLI read surfaces through the project broker connection file +- **Reasoning:** status already reads the project .agent-relay/connection.json; passing that path explicitly prevents AGENT_RELAY_STATE_DIR from redirecting who/agents/history/replies to a stale broker + +--- + +## Chapters + +### 1. 
Work + +_Agent: default_ + +- Resolved CLI read surfaces through the project broker connection file: Resolved CLI read surfaces through the project broker connection file diff --git a/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.json b/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.json new file mode 100644 index 000000000..4c6f5d14e --- /dev/null +++ b/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.json @@ -0,0 +1,53 @@ +{ + "id": "traj_gnqvtoxtc8dy", + "version": 1, + "task": { + "title": "Fix broker half-start recovery" + }, + "status": "completed", + "startedAt": "2026-05-19T12:34:36.057Z", + "completedAt": "2026-05-19T12:47:18.115Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T12:41:16.026Z" + } + ], + "chapters": [ + { + "id": "chap_b3d0wym6dp08", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T12:41:16.026Z", + "endedAt": "2026-05-19T12:47:18.115Z", + "events": [ + { + "ts": 1779194476027, + "type": "decision", + "content": "Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying: Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying", + "raw": { + "question": "Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying", + "chosen": "Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying", + "alternatives": [], + "reasoning": "The failure mode leaves a live agent-relay up --foreground process without usable connection metadata; scanning only agent-relay-broker misses that wrapper, so up/down --force must reap both wrapper and broker PID candidates." 
+ }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Added deterministic recovery for detached broker half-starts by reaping unready broker PIDs and metadata-less foreground wrappers before restart, and by cleaning failed detached children on readiness timeout.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/khaliqgant/Projects/AgentWorkforce/relay-worktrees/broker-half-start-recovery", + "tags": [], + "_trace": { + "startRef": "f5dd259e2a7009bcdbc1e9aa30c750ac74e5aeca", + "endRef": "f5dd259e2a7009bcdbc1e9aa30c750ac74e5aeca" + } +} diff --git a/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.md b/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.md new file mode 100644 index 000000000..14ba7492b --- /dev/null +++ b/.trajectories/completed/2026-05/traj_gnqvtoxtc8dy.md @@ -0,0 +1,33 @@ +# Trajectory: Fix broker half-start recovery + +> **Status:** ✅ Completed +> **Confidence:** 90% +> **Started:** May 19, 2026 at 02:34 PM +> **Completed:** May 19, 2026 at 02:47 PM + +--- + +## Summary + +Added deterministic recovery for detached broker half-starts by reaping unready broker PIDs and metadata-less foreground wrappers before restart, and by cleaning failed detached children on readiness timeout. + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying + +- **Chose:** Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying +- **Reasoning:** The failure mode leaves a live agent-relay up --foreground process without usable connection metadata; scanning only agent-relay-broker misses that wrapper, so up/down --force must reap both wrapper and broker PID candidates. + +--- + +## Chapters + +### 1. 
Work + +_Agent: default_ + +- Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying: Recover half-started detached brokers by killing foreground CLI wrappers and unready broker PIDs before retrying diff --git a/CHANGELOG.md b/CHANGELOG.md index 9b3cb3b1f..39fd65df8 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -68,6 +68,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - PTY context budget detection uses the latest percentage in output and can re-emit after the budget rises. - `agent-relay agents:logs` now cooks PTY redraws into line-oriented output by default and keeps raw terminal bytes behind `--raw`. - `agent-relay agents:logs --raw` preserves non-UTF-8 bytes, and follow mode keeps split escape/codepoint sequences intact. +- `agent-relay up --no-dashboard` and `agent-relay down --force` now recover half-started brokers that stayed alive without readable connection metadata. +- `agent-relay who` and `agent-relay agents` now fail clearly when broker queries fail instead of printing an empty agent list. +- `agent-relay history` and `agent-relay replies` now resolve the project broker session even when `AGENT_RELAY_STATE_DIR` points elsewhere. +- `agent-relay doctor` now fails with an actionable diagnostic for half-started, stale-connection, and unresolved-API-key-template brokers instead of reporting "healthy". - CLI readiness checks use the live VT grid and cursor position to avoid false ready states in alternate screens and menus. - `agent-relay history --from ` returns the newest messages after chronological sorting. - `agent-relay replies --unread` prints nothing when there are no unread messages. 
@@ -84,12 +88,16 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [6.2.6] - 2026-05-20 ### Product Perspective + #### User-Impacting Fixes + - Flush UTF-8 decoder on normal pty_closed path - Preserve split multi-byte UTF-8 in worker_stream (#922) (#922) ### Technical Perspective + #### Releases + - v6.2.6 --- @@ -97,15 +105,20 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [6.2.5] - 2026-05-19 ### Product Perspective + #### User-Impacting Fixes + - Handle write_pty frames in PTY worker (#920) ### Technical Perspective + #### Dependencies & Tooling + - Sync package-lock.json for next 15.5.18 bump - Bump next from 15.5.14 to 15.5.18 in /web #### Releases + - v6.2.5 --- @@ -113,10 +126,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [6.2.4] - 2026-05-19 ### Technical Perspective + #### Architecture & API Changes + - Use relaycast sdk 1.1 helpers #### Releases + - v6.2.4 --- @@ -124,17 +140,22 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 ## [6.2.3] - 2026-05-19 ### Product Perspective + #### User-Facing Features & Improvements + - **Align reported version with product release line** (#904) #### User-Impacting Fixes + - Address coderabbit review on version handling - Use next/link for docs navigation - Pass idle threshold to spawned workers - Address runtime review findings ### Technical Perspective + #### Architecture & API Changes + - Narrow public crate API - Group relaycast broker integration - Extract broker runtime event handlers @@ -143,11 +164,13 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Move broker crate under crates #### Dependencies & Tooling + - Record runtime split trajectory - Complete issue 875 trajectory file list - Update issue 875 trajectory metadata #### Releases + - v6.2.3 --- diff --git a/docs/doctor-orchestration-repros.md 
b/docs/doctor-orchestration-repros.md new file mode 100644 index 000000000..cfba04850 --- /dev/null +++ b/docs/doctor-orchestration-repros.md @@ -0,0 +1,140 @@ +# Doctor Orchestration Repros + +These are deterministic local repros for orchestration states that previously +required comparing `status`, `who`, and messaging command output by hand. +Credential values in observed output are redacted. + +## Stale or Wrong Broker Connection + +Run from a temporary project with the built CLI: + +```bash +CLI=/Users/khaliqgant/Projects/AgentWorkforce/relay/dist/src/cli/index.js +TMP=$(mktemp -d /tmp/relay-repro-A-C-XXXXXX) +cd "$TMP" +node "$CLI" up --no-dashboard --port 49200 +CONN1=$(cat .agent-relay/connection.json); echo "$CONN1" +node "$CLI" status +node "$CLI" who --json +kill -9 11078 +node "$CLI" up --no-dashboard --port 49300 && CONN2=$(cat .agent-relay/connection.json) +node -e 'const fs=require("fs"); const old=JSON.parse(process.argv[1]); const next=JSON.parse(process.argv[2]); old.pid=next.pid; fs.writeFileSync(".agent-relay/connection.json", JSON.stringify(old,null,2));' "$CONN1" "$CONN2" +cat .agent-relay/connection.json +node "$CLI" status +node "$CLI" status --wait-for=1 +node "$CLI" who --json +``` + +Observed output: + +```text +Broker started. +Broker PID: 11078 +Stop with: agent-relay down + +{ + "api_key": "br_", + "pid": 11078, + "port": 49201, + "url": "http://127.0.0.1:49201" +} + +Status: RUNNING +Mode: broker (stdio) +PID: 11078 +Project: /private/tmp/relay-repro-A-C-HuYoNc +Agents: 0 +Workspace Key: rk_live_ +Observer: https://agentrelay.com/observer?key=rk_live_ + +[] + +Broker started. +Broker PID: 11410 +Stop with: agent-relay down + +{ + "api_key": "br_", + "pid": 11410, + "port": 49201, + "url": "http://127.0.0.1:49201" +} + +Status: RUNNING +Mode: broker (stdio) +PID: 11410 +Project: /private/tmp/relay-repro-A-C-HuYoNc + +Status: STARTING +Mode: broker (stdio) +PID: 11410 +Project: /private/tmp/relay-repro-A-C-HuYoNc +Broker process is running, but the API did not become ready before timeout. 
+ +[] +``` + +## Unresolved API Key Template + +With the same temporary project, the correctly resolved broker session key +allowed an orchestrator read to reach Relaycast: + +```bash +node "$CLI" replies WorkerA --json +``` + +Observed output: + +```text +No DM conversation with WorkerA. +``` + +The literal unresolved template fails before a meaningful orchestrator read: + +```bash +RELAY_API_KEY='${RELAY_API_KEY}' node "$CLI" replies WorkerA --json +``` + +Observed output: + +```text +Failed to initialize relaycast client: Workspace key required (rk_live_...) +``` + +## Half-Started Broker With Missing Metadata + +Run with Relaycast environment variables unset so messaging commands must rely +on local broker metadata: + +```bash +CLI=/Users/khaliqgant/Projects/AgentWorkforce/relay/dist/src/cli/index.js +RUN=(env -u RELAY_API_KEY -u RELAY_AGENT_TOKEN -u RELAY_WORKSPACES_JSON -u RELAY_DEFAULT_WORKSPACE -u RELAY_WORKSPACE_ID -u RELAY_BASE_URL -u RELAY_BROKER_URL -u RELAY_BROKER_API_KEY -u RELAY_AGENT_NAME -u RELAY_AGENT_TYPE -u RELAY_STRICT_AGENT_NAME node "$CLI") +TMP=$(mktemp -d /tmp/relay-repro-half2-XXXXXX) +cd "$TMP" +"${RUN[@]}" up --no-dashboard --port 49600 +rm .agent-relay/connection.json +ps -p 15596 -o pid=,comm= +"${RUN[@]}" status +"${RUN[@]}" history +"${RUN[@]}" replies WorkerA --json +"${RUN[@]}" up --no-dashboard --port 49700 +``` + +Observed output: + +```text +Broker started. +Broker PID: 15596 +Stop with: agent-relay down + +15596 /Users/khaliqgant/Projects/AgentWorkforce/relay/target/release/agent-relay-broker + +Status: STOPPED + +Failed to initialize relaycast client: Failed to read broker connection metadata. Start the broker with `agent-relay up` or set RELAY_API_KEY. + +Failed to initialize relaycast client: Failed to read broker connection metadata. Start the broker with `agent-relay up` or set RELAY_API_KEY. + +Broker background start did not become ready within 10s (pid: 16245). 
+Run `agent-relay status --wait-for=10` for details, or `agent-relay down --force` to clean up. +``` diff --git a/src/cli/commands/agent-management.test.ts b/src/cli/commands/agent-management.test.ts index 247c7eee0..855eab69b 100644 --- a/src/cli/commands/agent-management.test.ts +++ b/src/cli/commands/agent-management.test.ts @@ -331,6 +331,21 @@ describe('registerAgentManagementCommands', () => { expect(exitCode).toBe(1); }); + it('fails agents:kill when listing agents fails', async () => { + const client = createClientMock({ + listAgents: vi.fn(async () => { + throw new Error('session query failed'); + }), + }); + const { program, deps } = createHarness({ client }); + + const exitCode = await runCommand(program, ['agents:kill', 'WorkerF']); + + expect(exitCode).toBe(1); + expect(deps.error).toHaveBeenCalledWith('Failed to list agents: session query failed'); + expect(deps.killProcess).not.toHaveBeenCalled(); + }); + it('lists agents including remote agents with --remote', async () => { const client = createClientMock({ listAgents: vi.fn(async () => [{ name: 'WorkerLocal', runtime: 'pty', pid: 2222 }]), diff --git a/src/cli/commands/agent-management.ts b/src/cli/commands/agent-management.ts index 802a22817..9f9b6f65c 100644 --- a/src/cli/commands/agent-management.ts +++ b/src/cli/commands/agent-management.ts @@ -8,6 +8,7 @@ import { getProjectPaths } from '@agent-relay/config'; import { runAgentsCommand, runAgentsLogsCommand, runWhoCommand } from '../lib/agent-management-listing.js'; import { defaultExit } from '../lib/exit.js'; +import { connectProjectBrokerClient } from '../lib/project-broker-client.js'; type ShadowMode = 'subagent' | 'process'; type ShadowTrigger = 'SESSION_END' | 'CODE_WRITTEN' | 'REVIEW_REQUEST' | 'EXPLICIT_ASK' | 'ALL_MESSAGES'; @@ -110,7 +111,7 @@ async function createSdkClient(cwd: string, autoStart: boolean): Promise []); + let agents: WorkerInfo[] = []; + try { + agents = await client.listAgents(); + } catch (err: unknown) { + const 
detail = err instanceof Error ? err.message : String(err); + deps.error(`Warning: spawned ${name}, but failed to refresh agent list: ${detail}`); + } const spawned = agents.find((agent) => agent.name === name); if (spawned?.pid) { deps.log(`Spawned agent: ${name} (pid: ${spawned.pid})`); @@ -584,7 +591,16 @@ export function registerAgentManagementCommands( deps.exit(1); return; } - const workers = await client.listAgents().catch(() => []); + let workers: WorkerInfo[]; + try { + workers = await client.listAgents(); + } catch (err: unknown) { + const detail = err instanceof Error ? err.message : String(err); + deps.error(`Failed to list agents: ${detail}`); + await client.shutdown().catch(() => undefined); + deps.exit(1); + return; + } await client.shutdown().catch(() => undefined); const worker = workers.find((entry) => entry.name === name); diff --git a/src/cli/commands/core.test.ts b/src/cli/commands/core.test.ts index 50135b64a..d079b3f23 100644 --- a/src/cli/commands/core.test.ts +++ b/src/cli/commands/core.test.ts @@ -618,11 +618,16 @@ describe('registerCoreCommands', () => { it('up --no-dashboard exits non-zero when the detached broker never becomes ready', async () => { const spawnedProcess = createSpawnedProcessMock(); let now = 0; + let childRunning = true; const sleepImpl = vi.fn(async (ms: number) => { now += ms; }); const killImpl = vi.fn((pid: number, signal?: NodeJS.Signals | number) => { - if (pid === 9001 && signal === 0) return; + if (pid === 9001 && signal === 0 && childRunning) return; + if (pid === 9001 && signal === 'SIGTERM') { + childRunning = false; + return; + } throw new Error('unexpected kill check'); }); const { program, deps } = createHarness({ @@ -641,10 +646,13 @@ describe('registerCoreCommands', () => { expect(deps.error).toHaveBeenCalledWith( 'Run `agent-relay status --wait-for=10` for details, or `agent-relay down --force` to clean up.' 
); + expect(killImpl).toHaveBeenCalledWith(9001, 'SIGTERM'); + expect(deps.warn).toHaveBeenCalledWith('Cleaning up failed broker start (pid: 9001)'); expect(deps.log).not.toHaveBeenCalledWith('Broker started.'); }); it('down --force only kills actual orphaned broker executables for the project', async () => { + const runningPids = new Set([222, 444, 666]); const execCommand = vi.fn(async (command: string) => { if (command === 'ps aux') { return { @@ -655,6 +663,8 @@ describe('registerCoreCommands', () => { 'khaliqgant 333 0.0 0.0 1 1 ?? S 1:00PM 0:00.01 /opt/bin/agent-relay-broker init --name project --channels general --persist', 'khaliqgant 444 0.0 0.0 1 1 ?? S 1:00PM 0:00.01 /opt/bin/agent-relay-broker init --state-dir /tmp/project/.agent-relay --persist', 'khaliqgant 555 0.0 0.0 1 1 ?? S 1:00PM 0:00.01 /opt/bin/agent-relay-broker init --state-dir /tmp/project-other/.agent-relay --persist', + 'khaliqgant 666 0.0 0.0 1 1 ?? S 1:00PM 0:00.01 /Users/test/.agent-relay/bin/agent-relay up --no-dashboard --foreground', + 'khaliqgant 777 0.0 0.0 1 1 ?? 
S 1:00PM 0:00.01 /Users/test/.agent-relay/bin/agent-relay status --wait-for=30', ].join('\n'), stderr: '', }; @@ -665,33 +675,151 @@ describe('registerCoreCommands', () => { if (command.includes('-p 333 ')) { return { stdout: 'p333\nfcwd\nn/tmp/project-other\n', stderr: '' }; } + if (command.includes('-p 666 ')) { + return { stdout: 'p666\nfcwd\nn/tmp/project\n', stderr: '' }; + } throw new Error(`unexpected command: ${command}`); }); - const killImpl = vi.fn(() => undefined); - const { program, deps } = createHarness({ execCommand, killImpl }); + const killImpl = vi.fn((pid: number, signal?: NodeJS.Signals | number) => { + if (signal === 0) { + if (runningPids.has(pid)) return; + throw new Error('not running'); + } + runningPids.delete(pid); + }); + let now = 0; + const { program, deps } = createHarness({ + execCommand, + killImpl, + nowImpl: vi.fn(() => now), + sleepImpl: vi.fn(async (ms: number) => { + now += ms; + }), + }); const exitCode = await runCommand(program, ['down', '--force']); expect(exitCode).toBeUndefined(); expect(killImpl).toHaveBeenCalledWith(222, 'SIGTERM'); expect(killImpl).toHaveBeenCalledWith(444, 'SIGTERM'); + expect(killImpl).toHaveBeenCalledWith(666, 'SIGTERM'); expect(killImpl).not.toHaveBeenCalledWith(111, 'SIGTERM'); expect(killImpl).not.toHaveBeenCalledWith(333, 'SIGTERM'); expect(killImpl).not.toHaveBeenCalledWith(555, 'SIGTERM'); + expect(killImpl).not.toHaveBeenCalledWith(777, 'SIGTERM'); expect(deps.warn).toHaveBeenCalledWith('Killing orphaned broker process (pid: 222)'); expect(deps.warn).toHaveBeenCalledWith('Killing orphaned broker process (pid: 444)'); + expect(deps.warn).toHaveBeenCalledWith('Killing orphaned broker process (pid: 666)'); expect(deps.log).toHaveBeenCalledWith('Cleaned up (was not running)'); }); + it('up --no-dashboard reaps a foreground child orphan before starting cleanly', async () => { + const spawnedProcess = createSpawnedProcessMock({ pid: 9001 }); + const runningPids = new Set([777, 9001, 4242]); + 
const fs = createFsMock(); + let now = 0; + const execCommand = vi.fn(async (command: string) => { + if (command === 'ps aux') { + return { + stdout: [ + 'USER PID %CPU %MEM VSZ RSS TT STAT STARTED TIME COMMAND', + 'khaliqgant 777 0.0 0.0 1 1 ?? S 1:00PM 0:00.01 /Users/test/.agent-relay/bin/agent-relay up --no-dashboard --foreground', + ].join('\n'), + stderr: '', + }; + } + if (command.includes('-p 777 ')) { + return { stdout: 'p777\nfcwd\nn/tmp/project\n', stderr: '' }; + } + throw new Error(`unexpected command: ${command}`); + }); + const sleepImpl = vi.fn(async (ms: number) => { + now += ms; + fs.writeFileSync('/tmp/project/.agent-relay/connection.json', connectionFile(4242)); + }); + const killImpl = vi.fn((pid: number, signal?: NodeJS.Signals | number) => { + if (signal === 0) { + if (runningPids.has(pid)) return; + throw new Error('not running'); + } + runningPids.delete(pid); + }); + const { program, deps } = createHarness({ + fs, + spawnedProcess, + execCommand, + killImpl, + nowImpl: vi.fn(() => now), + sleepImpl, + }); + + const exitCode = await runCommand(program, ['up', '--no-dashboard']); + + expect(exitCode).toBe(0); + expect(killImpl).toHaveBeenCalledWith(777, 'SIGTERM'); + expect(deps.warn).toHaveBeenCalledWith('Killing orphaned broker process (pid: 777)'); + expect(deps.spawnProcess).toHaveBeenCalledTimes(1); + expect(deps.log).toHaveBeenCalledWith('Broker started.'); + expect(deps.log).toHaveBeenCalledWith('Broker PID: 4242'); + }); + + it('up --no-dashboard replaces a live broker PID whose API never becomes ready', async () => { + const spawnedProcess = createSpawnedProcessMock({ pid: 9001 }); + const runningPids = new Set([3030, 9001, 4242]); + const fs = createFsMock({ ['/tmp/project/.agent-relay/connection.json']: connectionFile(3030) }); + let now = 0; + const sleepImpl = vi.fn(async (ms: number) => { + now += ms; + fs.writeFileSync('/tmp/project/.agent-relay/connection.json', connectionFile(4242)); + }); + const killImpl = vi.fn((pid: 
number, signal?: NodeJS.Signals | number) => { + if (signal === 0) { + if (runningPids.has(pid)) return; + throw new Error('not running'); + } + runningPids.delete(pid); + }); + sdkStatusClient.getStatus + .mockRejectedValueOnce(new Error('503 Service Unavailable')) + .mockResolvedValue({ agent_count: 0, pending_delivery_count: 0 }); + const { program, deps } = createHarness({ + fs, + spawnedProcess, + killImpl, + nowImpl: vi.fn(() => now), + sleepImpl, + }); + + const exitCode = await runCommand(program, ['up', '--no-dashboard']); + + expect(exitCode).toBe(0); + expect(killImpl).toHaveBeenCalledWith(3030, 'SIGTERM'); + expect(fs.unlinkSync).toHaveBeenCalledWith('/tmp/project/.agent-relay/connection.json'); + expect(deps.warn).toHaveBeenCalledWith( + 'Broker process is running but the API is not ready; killing half-started broker (pid: 3030).' + ); + expect(deps.spawnProcess).toHaveBeenCalledTimes(1); + expect(deps.log).toHaveBeenCalledWith('Broker PID: 4242'); + }); + it('up --no-dashboard reports the broker PID when the detached broker is live but API-unready', async () => { const spawnedProcess = createSpawnedProcessMock({ pid: 9001 }); let now = 0; - const fs = createFsMock({ ['/tmp/project/.agent-relay/connection.json']: connectionFile(4242) }); + const runningPids = new Set([9001, 4242]); + const fs = createFsMock(); const sleepImpl = vi.fn(async (ms: number) => { now += ms; + fs.writeFileSync('/tmp/project/.agent-relay/connection.json', connectionFile(4242)); }); const killImpl = vi.fn((pid: number, signal?: NodeJS.Signals | number) => { - if ((pid === 9001 || pid === 4242) && signal === 0) return; + if (signal === 0) { + if (runningPids.has(pid)) return; + throw new Error('not running'); + } + if (pid === 9001 || pid === 4242) { + runningPids.delete(pid); + return; + } throw new Error('unexpected kill check'); }); sdkStatusClient.getStatus.mockRejectedValue(new Error('503 Service Unavailable')); @@ -710,6 +838,8 @@ describe('registerCoreCommands', () => { 
'Broker background start did not become ready within 10s (pid: 4242).' ); expect(deps.error).toHaveBeenCalledWith('Broker process is running, but the API did not become ready.'); + expect(killImpl).toHaveBeenCalledWith(9001, 'SIGTERM'); + expect(killImpl).toHaveBeenCalledWith(4242, 'SIGTERM'); }); it('up --no-dashboard reports spawn failures without claiming background success', async () => { diff --git a/src/cli/commands/doctor.test.ts b/src/cli/commands/doctor.test.ts index 97fffb2b7..2cfa57e6e 100644 --- a/src/cli/commands/doctor.test.ts +++ b/src/cli/commands/doctor.test.ts @@ -13,6 +13,10 @@ const sdkMock = vi.hoisted(() => ({ shutdown: vi.fn(), })); +const childProcessMock = vi.hoisted(() => ({ + execFileSync: vi.fn(), +})); + // Store availability in an object to ensure closure works correctly across module resets const mockAvailability = { betterAvailable: true, nodeAvailable: true }; @@ -33,6 +37,10 @@ vi.mock('@agent-relay/sdk', () => ({ }, })); +vi.mock('node:child_process', () => ({ + execFileSync: childProcessMock.execFileSync, +})); + // doctor.ts now reads AGENT_RELAY_STORAGE_TYPE and AGENT_RELAY_STORAGE_PATH // env vars directly instead of importing getStorageConfigFromEnv @@ -150,6 +158,24 @@ function collectLogs() { }; } +function writeConnection(overrides: Partial<{ url: string; api_key: string; pid: number }> = {}) { + fs.mkdirSync(dataDir, { recursive: true }); + fs.writeFileSync( + path.join(dataDir, 'connection.json'), + JSON.stringify( + { + url: 'http://127.0.0.1:39999', + api_key: 'br_test', + pid: process.pid, + ...overrides, + }, + null, + 2 + ), + 'utf-8' + ); +} + beforeEach(() => { tempRoot = fs.mkdtempSync(path.join(os.tmpdir(), 'doctor-test-')); dataDir = path.join(tempRoot, '.agent-relay'); @@ -164,6 +190,8 @@ beforeEach(() => { sdkMock.getStatus.mockReset(); sdkMock.shutdown.mockReset(); sdkMock.connect.mockReset(); + childProcessMock.execFileSync.mockReset(); + childProcessMock.execFileSync.mockReturnValue(''); 
sdkMock.getStatus.mockResolvedValue({ auth: { authenticated: false, workspace_count: 0 }, pending_deliveries: [], @@ -183,6 +211,7 @@ afterEach(() => { delete process.env.AGENT_RELAY_DOCTOR_FORCE_BETTER_SQLITE3; delete process.env.AGENT_RELAY_STORAGE_TYPE; delete process.env.AGENT_RELAY_STORAGE_PATH; + delete process.env.RELAY_API_KEY; process.exitCode = undefined; vi.restoreAllMocks(); }); @@ -307,9 +336,10 @@ describe('doctor diagnostics', () => { expect(process.exitCode).toBe(1); }); - it('shuts down the broker client when status lookup throws', async () => { + it('fails when broker connection metadata points at an unreachable broker', async () => { process.env.AGENT_RELAY_DOCTOR_FORCE_NODE_SQLITE = '1'; process.env.AGENT_RELAY_DOCTOR_FORCE_BETTER_SQLITE3 = '1'; + writeConnection(); sdkMock.getStatus.mockRejectedValueOnce(new Error('broker unavailable')); const { logs, restore } = collectLogs(); const { runDoctor } = await loadDoctor(); @@ -317,7 +347,52 @@ describe('doctor diagnostics', () => { await runDoctor(); restore(); - expect(logs.join('\n')).toContain('Skipped (broker unavailable: broker unavailable)'); + const output = logs.join('\n'); + expect(output).toContain('Broker connection'); + expect(output).toContain('Stale or unreachable broker connection metadata: broker unavailable'); + expect(output).toContain('agent-relay down --force'); + expect(output).toContain('Some checks failed'); + expect(process.exitCode).toBe(1); expect(sdkMock.shutdown).toHaveBeenCalledTimes(1); }); + + it('fails when a literal RELAY_API_KEY template is still unresolved', async () => { + process.env.AGENT_RELAY_DOCTOR_FORCE_NODE_SQLITE = '1'; + process.env.AGENT_RELAY_DOCTOR_FORCE_BETTER_SQLITE3 = '1'; + process.env.RELAY_API_KEY = '${RELAY_API_KEY}'; + const { logs, restore } = collectLogs(); + const { runDoctor } = await loadDoctor(); + + await runDoctor(); + + restore(); + const output = logs.join('\n'); + expect(output).toContain('Relaycast API key'); + 
expect(output).toContain('Unresolved RELAY_API_KEY template (${RELAY_API_KEY})'); + expect(output).toContain('real rk_live_... workspace key'); + expect(output).toContain('Some checks failed'); + expect(process.exitCode).toBe(1); + }); + + it('fails when a broker process is alive but connection metadata is missing', async () => { + process.env.AGENT_RELAY_DOCTOR_FORCE_NODE_SQLITE = '1'; + process.env.AGENT_RELAY_DOCTOR_FORCE_BETTER_SQLITE3 = '1'; + childProcessMock.execFileSync.mockReturnValue( + `12345 /tmp/agent-relay-broker init --name ${path.basename(tempRoot)} --state-dir ${dataDir}\n` + ); + const { logs, restore } = collectLogs(); + const { runDoctor } = await loadDoctor(); + + await runDoctor(); + + restore(); + const output = logs.join('\n'); + expect(output).toContain('Broker connection'); + expect(output).toContain('Broker process alive but'); + expect(output).toContain('connection.json is missing'); + expect(output).toContain('pid: 12345'); + expect(output).toContain('half-started broker'); + expect(output).toContain('Some checks failed'); + expect(process.exitCode).toBe(1); + }); }); diff --git a/src/cli/commands/messaging.ts b/src/cli/commands/messaging.ts index 78836e867..5534c6fd6 100644 --- a/src/cli/commands/messaging.ts +++ b/src/cli/commands/messaging.ts @@ -4,6 +4,7 @@ import { getProjectPaths } from '@agent-relay/config'; import { defaultExit } from '../lib/exit.js'; import { parseSince, sanitizeForTerminal, sanitizeForTerminalLine } from '../lib/formatting.js'; +import { connectProjectBrokerClient } from '../lib/project-broker-client.js'; type ExitFn = (code: number) => never; const MAX_DM_FETCH_LIMIT = 1000; @@ -799,7 +800,7 @@ function sortConversationsMostRecentFirst(conversations: DmConversationSummary[] async function createDefaultClient(cwd: string): Promise { // Connect to an existing broker if one is running, otherwise spawn try { - const client = AgentRelayClient.connect({ cwd }); + const client = connectProjectBrokerClient(cwd); 
return client as unknown as MessagingBrokerClient; } catch { const client = await AgentRelayClient.spawn({ cwd }); @@ -815,7 +816,7 @@ async function resolveRelaycastApiKey(cwd: string): Promise { let client: AgentRelayClient; try { - client = AgentRelayClient.connect({ cwd }); + client = connectProjectBrokerClient(cwd); } catch { throw new Error( 'Failed to read broker connection metadata. Start the broker with `agent-relay up` or set RELAY_API_KEY.' diff --git a/src/cli/lib/agent-management-listing.test.ts b/src/cli/lib/agent-management-listing.test.ts index e5adef7af..d79288f06 100644 --- a/src/cli/lib/agent-management-listing.test.ts +++ b/src/cli/lib/agent-management-listing.test.ts @@ -13,6 +13,7 @@ import { function createDeps(options?: { workers?: ListingWorkerInfo[]; listAgentsError?: Error; + createClientError?: Error; nowIso?: string; metrics?: Array<{ name: string; pid: number; memory_bytes: number; uptime_secs: number }>; getMetricsError?: Error; @@ -42,11 +43,15 @@ function createDeps(options?: { const deps: AgentManagementListingDependencies = { getProjectRoot: vi.fn(() => '/tmp/project'), getDataDir: vi.fn(() => '/tmp/data'), - createClient: vi.fn(() => ({ - listAgents, - ...(getMetrics ? { getMetrics } : {}), - shutdown, - })), + createClient: options?.createClientError + ? vi.fn(() => { + throw options.createClientError; + }) + : vi.fn(() => ({ + listAgents, + ...(getMetrics ? 
{ getMetrics } : {}), + shutdown, + })), fileExists: vi.fn(() => false), readFile: vi.fn(() => ''), fetch: vi.fn(async () => { @@ -59,7 +64,7 @@ function createDeps(options?: { exit, }; - return { deps, listAgents, shutdown, log, error }; + return { deps, listAgents, shutdown, log, error, exit }; } describe('agent-management-listing JSON output', () => { @@ -249,16 +254,29 @@ describe('agent-management-listing JSON output', () => { expect(output).toContain('claude'); }); - it('runAgentsCommand returns [] JSON when listAgents fails', async () => { - const { deps, log, shutdown } = createDeps({ + it('runAgentsCommand exits non-zero when listAgents fails instead of emitting [] JSON', async () => { + const { deps, log, shutdown, error } = createDeps({ listAgentsError: new Error('broker unavailable'), }); - await runAgentsCommand({ json: true }, deps); + await expect(runAgentsCommand({ json: true }, deps)).rejects.toThrow('exit:1'); expect(shutdown).toHaveBeenCalledTimes(1); - expect(log).toHaveBeenCalledTimes(1); - expect(JSON.parse(log.mock.calls[0][0] as string)).toEqual([]); + expect(log).not.toHaveBeenCalled(); + expect(error).toHaveBeenCalledWith('Failed to query broker agents: broker unavailable'); + expect(error).toHaveBeenCalledWith('Start the broker with `agent-relay up` and try again.'); + }); + + it('runWhoCommand exits non-zero when broker client creation fails instead of emitting [] JSON', async () => { + const { deps, log, error } = createDeps({ + createClientError: new Error('stale connection refused'), + }); + + await expect(runWhoCommand({ json: true }, deps)).rejects.toThrow('exit:1'); + + expect(log).not.toHaveBeenCalled(); + expect(error).toHaveBeenCalledWith('Failed to query broker agents: stale connection refused'); + expect(error).toHaveBeenCalledWith('Start the broker with `agent-relay up` and try again.'); }); }); diff --git a/src/cli/lib/agent-management-listing.ts b/src/cli/lib/agent-management-listing.ts index eb8444647..dd88972f7 100644 --- 
a/src/cli/lib/agent-management-listing.ts +++ b/src/cli/lib/agent-management-listing.ts @@ -109,6 +109,11 @@ function tableCell(value: string | null | undefined, fallback = '-'): string { return sanitizeForTerminalLine(value ?? fallback); } +function formatListAgentsError(err: unknown): string { + const detail = err instanceof Error ? err.message : String(err); + return `Failed to query broker agents: ${sanitizeForTerminalLine(detail)}`; +} + function shouldHideLocalAgentByDefault(name: string | undefined): boolean { if (!name) return true; if (name.startsWith('__')) return true; @@ -269,16 +274,23 @@ export async function runAgentsCommand( let client: Awaited>; try { client = await deps.createClient(deps.getProjectRoot()); - } catch { - if (options.json) { - deps.log(JSON.stringify([], null, 2)); - } else { - deps.log('No agents found. Ensure the broker is running and agents are connected.'); - } + } catch (err: unknown) { + deps.error(formatListAgentsError(err)); + deps.error('Start the broker with `agent-relay up` and try again.'); + deps.exit(1); return; } - const workers = await client.listAgents().catch(() => []); - await client.shutdown().catch(() => undefined); + let workers: ListingWorkerInfo[]; + try { + workers = await client.listAgents(); + } catch (err: unknown) { + deps.error(formatListAgentsError(err)); + deps.error('Start the broker with `agent-relay up` and try again.'); + deps.exit(1); + return; + } finally { + await client.shutdown().catch(() => undefined); + } const combined: CombinedAgent[] = workers .filter((worker) => (options.all ? true : !shouldHideLocalAgentByDefault(worker.name))) @@ -384,13 +396,10 @@ export async function runWhoCommand( let client: Awaited>; try { client = await deps.createClient(deps.getProjectRoot()); - } catch { - if (options.json) { - deps.log(JSON.stringify([], null, 2)); - } else { - const hint = options.all ? 
'' : ' (use --all to include internal/cli agents)'; - deps.log(`No active agents found${hint}.`); - } + } catch (err: unknown) { + deps.error(formatListAgentsError(err)); + deps.error('Start the broker with `agent-relay up` and try again.'); + deps.exit(1); return; } // Real per-agent metrics from the broker (pid / memory / uptime). This @@ -412,46 +421,46 @@ export async function runWhoCommand( } } - const onlineAgents = await client - .listAgents() - .then((list) => - list - .filter((agent) => (options.all ? true : !shouldHideLocalAgentByDefault(agent.name))) - .map((agent) => { - const m = metricsByName.get(agent.name); - return { - name: agent.name, - cli: agent.cli || agent.runtime || null, - // An agent present in the broker's live list is connected. We - // do not synthesize idle/exited here — that requires broker - // lifecycle state the CLI cannot observe without a follow-up - // broker change. - status: 'online' as const, - pid: m?.pid ?? agent.pid ?? null, - uptimeSecs: typeof m?.uptime_secs === 'number' ? m.uptime_secs : null, - memoryBytes: typeof m?.memory_bytes === 'number' ? m.memory_bytes : null, - lastActivity: agent.last_activity_at ?? null, - contextBudgetPct: typeof agent.context_budget_pct === 'number' ? agent.context_budget_pct : null, - currentState: agent.current_state ?? 
'working', - }; - }) - ) - .catch( - () => - [] as Array<{ - name: string; - cli: string | null; - status: 'online'; - pid: number | null; - uptimeSecs: number | null; - memoryBytes: number | null; - lastActivity: string | null; - contextBudgetPct: number | null; - currentState: 'working' | 'idle' | 'blocked_on_send'; - }> - ); - - await client.shutdown().catch(() => undefined); + let onlineAgents: Array<{ + name: string; + cli: string | null; + status: 'online'; + pid: number | null; + uptimeSecs: number | null; + memoryBytes: number | null; + lastActivity: string | null; + contextBudgetPct: number | null; + currentState: 'working' | 'idle' | 'blocked_on_send'; + }>; + try { + onlineAgents = (await client.listAgents()) + .filter((agent) => (options.all ? true : !shouldHideLocalAgentByDefault(agent.name))) + .map((agent) => { + const m = metricsByName.get(agent.name); + return { + name: agent.name, + cli: agent.cli || agent.runtime || null, + // An agent present in the broker's live list is connected. We + // do not synthesize idle/exited here — that requires broker + // lifecycle state the CLI cannot observe without a follow-up + // broker change. + status: 'online' as const, + pid: m?.pid ?? agent.pid ?? null, + uptimeSecs: typeof m?.uptime_secs === 'number' ? m.uptime_secs : null, + memoryBytes: typeof m?.memory_bytes === 'number' ? m.memory_bytes : null, + lastActivity: agent.last_activity_at ?? null, + contextBudgetPct: typeof agent.context_budget_pct === 'number' ? agent.context_budget_pct : null, + currentState: agent.current_state ?? 
'working', + }; + }); + } catch (err: unknown) { + deps.error(formatListAgentsError(err)); + deps.error('Start the broker with `agent-relay up` and try again.'); + deps.exit(1); + return; + } finally { + await client.shutdown().catch(() => undefined); + } if (options.json) { deps.log(JSON.stringify(onlineAgents, null, 2)); diff --git a/src/cli/lib/broker-lifecycle.ts b/src/cli/lib/broker-lifecycle.ts index 286fc8a6e..e599e2e4a 100644 --- a/src/cli/lib/broker-lifecycle.ts +++ b/src/cli/lib/broker-lifecycle.ts @@ -321,6 +321,20 @@ function isBrokerExecutableCommand(command: string): boolean { return basename === 'agent-relay-broker' || basename.startsWith('agent-relay-broker-'); } +function isForegroundBrokerCliCommand(command: string): boolean { + if (command.includes('agent-relay-mcp')) { + return false; + } + if (!/(?:^|\s)up(?:\s|$)/.test(command) || !/(?:^|\s)--foreground(?:\s|=|$)/.test(command)) { + return false; + } + return /(?:^|\s)(?:\S*agent-relay(?:\.js)?|\S*agent-relay-[^\s]+)(?:\s|$)/.test(command); +} + +function isBrokerProcessCommand(command: string): boolean { + return isBrokerExecutableCommand(command) || isForegroundBrokerCliCommand(command); +} + function escapeRegExp(value: string): string { return value.replace(/[.*+?^${}()|[\]\\]/g, '\\$&'); } @@ -351,32 +365,62 @@ async function processCwdMatchesProjectRoot( } } -async function killOrphanedBrokerProcesses(projectRoot: string, deps: CoreDependencies): Promise { +async function terminateProcess(pid: number, deps: CoreDependencies, force: boolean): Promise { + try { + deps.killProcess(pid, 'SIGTERM'); + } catch { + return false; + } + + const exited = await waitForProcessExit(pid, force ? 
500 : 300, deps); + if (exited || !force) { + return exited; + } + + try { + deps.killProcess(pid, 'SIGKILL'); + } catch { + return false; + } + return waitForProcessExit(pid, 500, deps); +} + +async function killOrphanedBrokerProcesses( + projectRoot: string, + deps: CoreDependencies, + options?: { force?: boolean } +): Promise<{ matchedCount: number; killedCount: number }> { + let matchedCount = 0; + let killedCount = 0; try { const resolvedProjectRoot = path.resolve(projectRoot); const brokerName = path.basename(resolvedProjectRoot) || 'project'; const candidates: ProcessInfo[] = []; try { const processList = await deps.execCommand('ps aux'); - const brokerProcesses = processList.stdout + const relayProcesses = processList.stdout .split('\n') .map(parsePsAuxLine) .filter((process): process is ProcessInfo => process !== null) - .filter((process) => isBrokerExecutableCommand(process.command)); + .filter((process) => isBrokerProcessCommand(process.command)); const matchedPids = new Set(); - for (const processInfo of brokerProcesses) { + for (const processInfo of relayProcesses) { if (commandHasProjectRoot(processInfo.command, resolvedProjectRoot)) { candidates.push(processInfo); matchedPids.add(processInfo.pid); } } - for (const processInfo of brokerProcesses) { + for (const processInfo of relayProcesses) { + if (matchedPids.has(processInfo.pid)) { + continue; + } + const cwdMatches = await processCwdMatchesProjectRoot(processInfo, resolvedProjectRoot, deps); + if (!cwdMatches) continue; if ( - matchedPids.has(processInfo.pid) || - !commandHasBrokerName(processInfo.command, brokerName) || - !(await processCwdMatchesProjectRoot(processInfo, resolvedProjectRoot, deps)) + isBrokerExecutableCommand(processInfo.command) && + !commandHasBrokerName(processInfo.command, brokerName) ) { continue; } @@ -390,20 +434,19 @@ async function killOrphanedBrokerProcesses(projectRoot: string, deps: CoreDepend if (pid === deps.pid) { continue; } + matchedCount += 1; deps.warn(`Killing 
orphaned broker process (pid: ${pid})`); - try { - deps.killProcess(pid, 'SIGTERM'); - } catch { - // Process may have already exited. + const killed = await terminateProcess(pid, deps, options?.force === true); + if (killed) { + killedCount += 1; + } else if (options?.force === true) { + deps.warn(`Broker orphan process may still be running (pid: ${pid})`); } } - // Give killed processes a moment to exit. - if (candidates.length > 0) { - await deps.sleep(300); - } } catch { // Best-effort orphan cleanup. } + return { matchedCount, killedCount }; } function ensureBundledRelaycastMcpCommand(deps: CoreDependencies): void { @@ -428,6 +471,49 @@ async function waitForProcessExit(pid: number, timeoutMs: number, deps: CoreDepe return false; } +async function recoverHalfStartedBroker( + paths: CoreProjectPaths, + deps: CoreDependencies +): Promise<'running' | 'recovered' | 'clear' | 'blocked'> { + deps.fs.mkdirSync(paths.dataDir, { recursive: true }); + const readiness = await waitForBrokerReadiness(paths, deps, 0, true); + if (readiness.state === 'running') { + return 'running'; + } + + if (readiness.state === 'starting') { + deps.warn( + `Broker process is running but the API is not ready; killing half-started broker (pid: ${readiness.conn.pid}).` + ); + const stopped = await terminateProcess(readiness.conn.pid, deps, true); + if (!stopped) { + deps.error( + `Failed to stop half-started broker process (pid: ${readiness.conn.pid}). ` + + 'Run `agent-relay down --force` to retry cleanup, or remove `.agent-relay/` after stopping the process.' + ); + return 'blocked'; + } + cleanupBrokerFiles(paths, deps); + return 'recovered'; + } + + const orphanCleanup = await killOrphanedBrokerProcesses(paths.projectRoot, deps, { force: true }); + if (orphanCleanup.matchedCount > 0) { + if (orphanCleanup.killedCount < orphanCleanup.matchedCount) { + deps.error( + 'Failed to stop all half-started broker processes. 
' + + 'Run `agent-relay down --force` to retry cleanup, or remove `.agent-relay/` after stopping the processes.' + ); + return 'blocked'; + } + cleanupBrokerFiles(paths, deps); + return 'recovered'; + } + + cleanupBrokerFiles(paths, deps); + return 'clear'; +} + function cleanupBrokerFiles(paths: CoreProjectPaths, deps: CoreDependencies): void { const runtimePath = path.join(paths.dataDir, 'runtime.json'); const relaySockPath = path.join(paths.dataDir, 'relay.sock'); @@ -1146,6 +1232,23 @@ export async function runUpCommand(options: UpOptions, deps: CoreDependencies): } if (options.background || (options.dashboard === false && !options.foreground)) { + const preflight = await recoverHalfStartedBroker(paths, deps); + if (preflight === 'running') { + const pid = readBrokerPid(paths.dataDir, deps); + deps.error( + pid + ? `Broker already running for this project (pid: ${pid}).` + : 'Broker already running for this project.' + ); + deps.error('Run `agent-relay status` to inspect it, then `agent-relay down` to stop it.'); + deps.exit(1); + return; + } + if (preflight === 'blocked') { + deps.exit(1); + return; + } + const args = childUpArgsForDetachedStart(options, deps); const invocation = detachedCliInvocation(deps, args); let child: SpawnedProcess; @@ -1175,6 +1278,24 @@ export async function runUpCommand(options: UpOptions, deps: CoreDependencies): deps.error( 'Run `agent-relay status --wait-for=10` for details, or `agent-relay down --force` to clean up.' ); + const cleanupPids = new Set(); + if (typeof child.pid === 'number' && child.pid > 0) { + cleanupPids.add(child.pid); + } + if (readiness.state === 'starting') { + cleanupPids.add(readiness.conn.pid); + } + for (const cleanupPid of cleanupPids) { + deps.warn(`Cleaning up failed broker start (pid: ${cleanupPid})`); + const stopped = await terminateProcess(cleanupPid, deps, true); + if (!stopped) { + deps.error( + `Failed to stop half-started broker process (pid: ${cleanupPid}). 
` + + 'Run `agent-relay down --force` to retry cleanup, or remove `.agent-relay/` after stopping the process.' + ); + } + } + cleanupBrokerFiles(paths, deps); deps.exit(1); return; } @@ -1522,7 +1643,7 @@ export async function runDownCommand(options: DownOptions, deps: CoreDependencie const conn = readBrokerConnectionFromFs(deps.fs, paths.dataDir); if (!conn) { if (options.force) { - await killOrphanedBrokerProcesses(paths.projectRoot, deps); + await killOrphanedBrokerProcesses(paths.projectRoot, deps, { force: true }); cleanupBrokerFiles(paths, deps); deps.log('Cleaned up (was not running)'); } else { diff --git a/src/cli/lib/doctor.ts b/src/cli/lib/doctor.ts index 27946677e..55e1b013b 100644 --- a/src/cli/lib/doctor.ts +++ b/src/cli/lib/doctor.ts @@ -1,5 +1,6 @@ import fs from 'node:fs'; import path from 'node:path'; +import { execFileSync } from 'node:child_process'; import { createRequire } from 'node:module'; import { getProjectPaths } from '@agent-relay/config'; import { AgentRelayClient, type BrokerStatus } from '@agent-relay/sdk'; @@ -37,8 +38,191 @@ interface DiagnosticDb { close?: () => void; } +interface DoctorBrokerConnection { + url: string; + api_key: string; + pid: number; +} + +interface BrokerProcessInfo { + pid: number; + command: string; +} + +function parseBrokerConnection(raw: string): DoctorBrokerConnection | null { + try { + const parsed = JSON.parse(raw) as unknown; + if ( + typeof parsed === 'object' && + parsed !== null && + !Array.isArray(parsed) && + typeof (parsed as { url?: unknown }).url === 'string' && + typeof (parsed as { api_key?: unknown }).api_key === 'string' && + typeof (parsed as { pid?: unknown }).pid === 'number' && + (parsed as { pid: number }).pid > 0 + ) { + const conn = parsed as DoctorBrokerConnection; + return { url: conn.url, api_key: conn.api_key, pid: conn.pid }; + } + } catch { + // Handled by caller as invalid metadata. 
+ } + return null; +} + +function readBrokerConnectionFile(dataDir: string): { + path: string; + exists: boolean; + conn: DoctorBrokerConnection | null; +} { + const connPath = path.join(dataDir, 'connection.json'); + if (!fs.existsSync(connPath)) { + return { path: connPath, exists: false, conn: null }; + } + + try { + return { + path: connPath, + exists: true, + conn: parseBrokerConnection(fs.readFileSync(connPath, 'utf-8')), + }; + } catch { + return { path: connPath, exists: true, conn: null }; + } +} + +function isProcessRunning(pid: number): boolean { + try { + process.kill(pid, 0); + return true; + } catch { + return false; + } +} + +function parsePsLine(line: string): BrokerProcessInfo | null { + const match = line.match(/^\s*(\d+)\s+(.+)$/); + if (!match) return null; + const pid = Number.parseInt(match[1], 10); + if (!Number.isFinite(pid) || pid <= 0 || pid === process.pid) return null; + return { pid, command: match[2] }; +} + +function findLiveBrokerProcesses(projectRoot: string, dataDir: string): BrokerProcessInfo[] { + let output: string; + try { + output = execFileSync('ps', ['axo', 'pid=,command='], { encoding: 'utf-8' }); + } catch { + return []; + } + + const resolvedProjectRoot = path.resolve(projectRoot); + const resolvedDataDir = path.resolve(dataDir); + const brokerName = path.basename(resolvedProjectRoot); + return output + .split('\n') + .map(parsePsLine) + .filter((processInfo): processInfo is BrokerProcessInfo => processInfo !== null) + .filter((processInfo) => processInfo.command.includes('agent-relay-broker')) + .filter( + (processInfo) => + processInfo.command.includes(resolvedProjectRoot) || + processInfo.command.includes(resolvedDataDir) || + (brokerName !== '' && processInfo.command.includes(`--name ${brokerName}`)) + ); +} + +function unresolvedTemplate(value: string | undefined): string | null { + const trimmed = value?.trim(); + if (!trimmed) return null; + const match = trimmed.match(/\$\{[^}]+\}/); + return match?.[0] ?? 
null; +} + async function checkBrokerReliability(): Promise { let client: AgentRelayClient | undefined; + const paths = getProjectPaths(); + const connection = readBrokerConnectionFile(paths.dataDir); + const relayApiKeyTemplate = unresolvedTemplate(process.env.RELAY_API_KEY); + const relayApiKeyResult: CheckResult | null = relayApiKeyTemplate + ? { + name: 'Relaycast API key', + ok: false, + message: `Unresolved RELAY_API_KEY template (${relayApiKeyTemplate})`, + remediation: 'Export a real rk_live_... workspace key instead of a literal ${...} placeholder.', + } + : null; + + if (!connection.exists) { + const liveBrokers = findLiveBrokerProcesses(paths.projectRoot, paths.dataDir); + if (liveBrokers.length > 0) { + return [ + ...(relayApiKeyResult ? [relayApiKeyResult] : []), + { + name: 'Broker connection', + ok: false, + message: `Broker process alive but ${relativePath(connection.path)} is missing (pid${liveBrokers.length === 1 ? '' : 's'}: ${liveBrokers.map((processInfo) => processInfo.pid).join(', ')})`, + remediation: + 'Run `agent-relay down --force`, then `agent-relay up` to clear the half-started broker.', + }, + { + name: 'Outbound queues', + ok: true, + message: 'Skipped (broker connection metadata missing)', + }, + ]; + } + + return [ + ...(relayApiKeyResult ? [relayApiKeyResult] : []), + { + name: 'Broker connection', + ok: true, + message: 'Skipped (broker not running: no connection metadata found)', + }, + { + name: 'Outbound queues', + ok: true, + message: 'Skipped (broker not running)', + }, + ]; + } + + if (!connection.conn) { + return [ + ...(relayApiKeyResult ? 
[relayApiKeyResult] : []), + { + name: 'Broker connection', + ok: false, + message: `Invalid broker connection metadata at ${relativePath(connection.path)}`, + remediation: 'Run `agent-relay down --force`, then `agent-relay up` to rewrite connection metadata.', + }, + { + name: 'Outbound queues', + ok: true, + message: 'Skipped (broker connection metadata invalid)', + }, + ]; + } + + if (!isProcessRunning(connection.conn.pid)) { + return [ + ...(relayApiKeyResult ? [relayApiKeyResult] : []), + { + name: 'Broker connection', + ok: false, + message: `Stale broker connection metadata: pid ${connection.conn.pid} is not running`, + remediation: + 'Run `agent-relay down --force`, then `agent-relay up` to remove stale connection metadata.', + }, + { + name: 'Outbound queues', + ok: true, + message: 'Skipped (broker process is not running)', + }, + ]; + } + try { client = AgentRelayClient.connect({ cwd: process.cwd() }); const status = await client.getStatus(); @@ -50,6 +234,12 @@ async function checkBrokerReliability(): Promise { const pending = typedStatus.pending_deliveries ?? []; const stuck = pending.filter((delivery) => (delivery.age_ms ?? 0) >= 10_000 || delivery.last_error); return [ + ...(relayApiKeyResult ? [relayApiKeyResult] : []), + { + name: 'Broker connection', + ok: true, + message: `Reachable at ${connection.conn.url} (pid ${connection.conn.pid})`, + }, { name: 'Broker auth', ok: true, @@ -71,10 +261,13 @@ async function checkBrokerReliability(): Promise { } catch (err: unknown) { const message = err instanceof Error ? err.message : String(err); return [ + ...(relayApiKeyResult ? 
[relayApiKeyResult] : []), { - name: 'Broker auth', - ok: true, - message: `Skipped (broker unavailable: ${message})`, + name: 'Broker connection', + ok: false, + message: `Stale or unreachable broker connection metadata: ${message}`, + remediation: + 'Run `agent-relay status --wait-for=10` to confirm readiness, or `agent-relay down --force` before retrying.', }, { name: 'Outbound queues', diff --git a/src/cli/lib/project-broker-client.test.ts b/src/cli/lib/project-broker-client.test.ts new file mode 100644 index 000000000..ad36fd24f --- /dev/null +++ b/src/cli/lib/project-broker-client.test.ts @@ -0,0 +1,32 @@ +import { afterEach, describe, expect, it, vi } from 'vitest'; + +const connectMock = vi.hoisted(() => vi.fn()); + +vi.mock('@agent-relay/sdk', () => ({ + AgentRelayClient: { + connect: connectMock, + }, +})); + +describe('project broker client resolution', () => { + afterEach(() => { + connectMock.mockReset(); + vi.unstubAllEnvs(); + }); + + it('connects through the project connection file even when AGENT_RELAY_STATE_DIR points elsewhere', async () => { + const client = { getSession: vi.fn(async () => ({ workspace_key: 'rk_live_project' })) }; + connectMock.mockReturnValue(client); + vi.stubEnv('AGENT_RELAY_STATE_DIR', '/tmp/stale-state'); + + const { connectProjectBrokerClient, getProjectBrokerConnectionPath } = + await import('./project-broker-client.js'); + + expect(getProjectBrokerConnectionPath('/tmp/project')).toBe('/tmp/project/.agent-relay/connection.json'); + expect(connectProjectBrokerClient('/tmp/project')).toBe(client); + expect(connectMock).toHaveBeenCalledWith({ + cwd: '/tmp/project', + connectionPath: '/tmp/project/.agent-relay/connection.json', + }); + }); +}); diff --git a/src/cli/lib/project-broker-client.ts b/src/cli/lib/project-broker-client.ts new file mode 100644 index 000000000..c1f38cf4f --- /dev/null +++ b/src/cli/lib/project-broker-client.ts @@ -0,0 +1,14 @@ +import path from 'node:path'; + +import { AgentRelayClient } from 
'@agent-relay/sdk'; + +export function getProjectBrokerConnectionPath(projectRoot: string): string { + return path.join(projectRoot, '.agent-relay', 'connection.json'); +} + +export function connectProjectBrokerClient(projectRoot: string): AgentRelayClient { + return AgentRelayClient.connect({ + cwd: projectRoot, + connectionPath: getProjectBrokerConnectionPath(projectRoot), + }); +}