From 2ecfb01884a7cd3eae7a67768c20b84feb994c29 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 20:56:13 -0400 Subject: [PATCH 1/8] refactor: split broker main entrypoint --- .../completed/2026-05/traj_f9wxa8ujeg78.json | 57 + .../completed/2026-05/traj_f9wxa8ujeg78.md | 34 + .trajectories/index.json | 9 +- crates/broker/src/cli/mod.rs | 241 + crates/broker/src/cli_mcp_args.rs | 2 +- crates/broker/src/main.rs | 7730 +---------------- crates/broker/src/pty_worker.rs | 22 +- crates/broker/src/routing.rs | 2 +- crates/broker/src/runtime.rs | 7512 ++++++++++++++++ crates/broker/src/worker.rs | 2 +- crates/broker/src/wrap.rs | 34 +- 11 files changed, 7906 insertions(+), 7739 deletions(-) create mode 100644 .trajectories/completed/2026-05/traj_f9wxa8ujeg78.json create mode 100644 .trajectories/completed/2026-05/traj_f9wxa8ujeg78.md create mode 100644 crates/broker/src/cli/mod.rs create mode 100644 crates/broker/src/runtime.rs diff --git a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json new file mode 100644 index 000000000..edd1dc7cc --- /dev/null +++ b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json @@ -0,0 +1,57 @@ +{ + "id": "traj_f9wxa8ujeg78", + "version": 1, + "task": { + "title": "Refactor broker main for issue 875", + "source": { + "system": "plain", + "id": "#875" + } + }, + "status": "completed", + "startedAt": "2026-05-19T00:54:40.328Z", + "completedAt": "2026-05-19T00:55:57.506Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T00:54:43.119Z" + } + ], + "chapters": [ + { + "id": "chap_si2lityvv2kf", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T00:54:43.119Z", + "endedAt": "2026-05-19T00:55:57.506Z", + "events": [ + { + "ts": 1779152083121, + "type": "decision", + "content": "Split broker binary entrypoint mechanically: Split broker binary entrypoint mechanically", + "raw": { + "question": "Split broker binary 
entrypoint mechanically", + "chosen": "Split broker binary entrypoint mechanically", + "alternatives": [], + "reasoning": "Kept behavior stable by moving clap parsing to crates/broker/src/cli/mod.rs and the existing broker runtime/test code to crates/broker/src/runtime.rs, then updated sibling modules to import moved helpers explicitly." + }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Split agent-relay-broker main.rs into a thin entrypoint, cli command parser module, and runtime module; updated sibling imports and verified cargo check, cargo test, cargo test --release, cargo fmt --check, and cargo clippy -- -D warnings.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/will/Projects/AgentWorkforce/relay", + "tags": [], + "_trace": { + "startRef": "8a6b9b41b6d2de072e41ecd62f382419e1efb764", + "endRef": "8a6b9b41b6d2de072e41ecd62f382419e1efb764" + } +} diff --git a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.md b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.md new file mode 100644 index 000000000..dfdf3aaba --- /dev/null +++ b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.md @@ -0,0 +1,34 @@ +# Trajectory: Refactor broker main for issue 875 + +> **Status:** ✅ Completed +> **Task:** #875 +> **Confidence:** 90% +> **Started:** May 18, 2026 at 08:54 PM +> **Completed:** May 18, 2026 at 08:55 PM + +--- + +## Summary + +Split agent-relay-broker main.rs into a thin entrypoint, cli command parser module, and runtime module; updated sibling imports and verified cargo check, cargo test, cargo test --release, cargo fmt --check, and cargo clippy -- -D warnings. 
+ +**Approach:** Standard approach + +--- + +## Key Decisions + +### Split broker binary entrypoint mechanically + +- **Chose:** Split broker binary entrypoint mechanically +- **Reasoning:** Kept behavior stable by moving clap parsing to crates/broker/src/cli/mod.rs and the existing broker runtime/test code to crates/broker/src/runtime.rs, then updated sibling modules to import moved helpers explicitly. + +--- + +## Chapters + +### 1. Work + +_Agent: default_ + +- Split broker binary entrypoint mechanically: Split broker binary entrypoint mechanically diff --git a/.trajectories/index.json b/.trajectories/index.json index fb4ae1559..043669111 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-05-19T00:17:27.820Z", + "lastUpdated": "2026-05-19T00:55:57.678Z", "trajectories": { "traj_05xg7j388bc4": { "title": "Add browser workflow step integration", @@ -953,6 +953,13 @@ "startedAt": "2026-05-19T00:07:13.993Z", "completedAt": "2026-05-19T00:17:27.680Z", "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_jmf9pyt3zikn.json" + }, + "traj_f9wxa8ujeg78": { + "title": "Refactor broker main for issue 875", + "status": "completed", + "startedAt": "2026-05-19T00:54:40.328Z", + "completedAt": "2026-05-19T00:55:57.506Z", + "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json" } } } diff --git a/crates/broker/src/cli/mod.rs b/crates/broker/src/cli/mod.rs new file mode 100644 index 000000000..335b6295f --- /dev/null +++ b/crates/broker/src/cli/mod.rs @@ -0,0 +1,241 @@ +use std::path::PathBuf; + +use anyhow::Result; +use clap::{Parser, Subcommand, ValueEnum}; +use relay_broker::{ + protocol::HeadlessProvider as ProtocolHeadlessProvider, + telemetry::{TelemetryClient, TelemetryEvent}, +}; + +use crate::{cli_mcp_args, pty_worker, runtime, swarm, wrap}; + +#[derive(Debug, Parser)] +#[command(name = "agent-relay-broker")] 
+#[command(about = "Agent relay broker and worker runtime")] +struct Cli { + #[command(subcommand)] + command: Commands, +} + +#[derive(Debug, Subcommand)] +enum Commands { + Init(InitCommand), + Pty(PtyCommand), + Headless(HeadlessCommand), + /// Compute MCP injection args and side-effect config file paths for a CLI + /// without spawning it. Outputs JSON to stdout. + McpArgs(McpArgsCommand), + /// Run ad-hoc swarm execution via the relay broker + Swarm(swarm::SwarmArgs), + /// Capture the current visible PTY screen of a running worker and print + /// it. Talks to the broker over its listen API. + DumpPty(DumpPtyCommand), + /// Internal: wraps a CLI in a PTY with interactive passthrough. + /// Used by the SDK — not for direct user invocation. + /// Usage: agent-relay-broker wrap codex -- --full-auto + #[command(hide = true)] + Wrap { + /// The CLI to wrap (e.g. "codex", "claude") + cli: String, + /// Additional arguments passed to the wrapped CLI + #[arg(trailing_var_arg = true, allow_hyphen_values = true)] + args: Vec, + }, +} + +impl Commands { + fn telemetry_name(&self) -> &'static str { + match self { + Commands::Init(_) => "init", + Commands::Pty(_) => "pty", + Commands::Headless(_) => "headless", + Commands::McpArgs(_) => "mcp_args", + Commands::Swarm(_) => "swarm", + Commands::DumpPty(_) => "dump_pty", + Commands::Wrap { .. 
} => "wrap", + } + } +} + +pub(crate) async fn run() -> Result<()> { + runtime::init_tracing(); + + let cli = Cli::parse(); + let telemetry = TelemetryClient::new(); + telemetry.track(TelemetryEvent::CliCommandRun { + command_name: cli.command.telemetry_name().to_string(), + }); + + match cli.command { + Commands::Init(cmd) => runtime::run_init(cmd, telemetry).await, + Commands::Pty(cmd) => pty_worker::run_pty_worker(cmd).await, + Commands::Headless(cmd) => runtime::run_headless_worker(cmd).await, + Commands::McpArgs(cmd) => cli_mcp_args::run_mcp_args(cmd).await, + Commands::Swarm(args) => swarm::run_swarm(args).await, + Commands::DumpPty(cmd) => runtime::run_dump_pty(cmd).await, + Commands::Wrap { cli, args } => wrap::run_wrap(cli, args, false, telemetry).await, + } +} + +#[derive(Debug, clap::Args, Clone)] +pub(crate) struct DumpPtyCommand { + /// Worker name to snapshot. + pub(crate) name: String, + + /// Snapshot format. `plain` is one-line-per-row UTF-8; `ansi` is the + /// reproduction byte stream (control characters + SGR + cursor commands) + /// suitable for piping into a terminal. + #[arg(long, default_value = "plain")] + pub(crate) format: DumpPtyFormat, + + /// Override the broker base URL. Falls back to RELAY_BROKER_URL, then to + /// reading `.agent-relay/connection.json` in the current directory. + #[arg(long)] + pub(crate) broker_url: Option, + + /// Override the broker API key. Falls back to RELAY_BROKER_API_KEY, then + /// to reading `.agent-relay/connection.json` in the current directory. + #[arg(long)] + pub(crate) api_key: Option, + + /// Override the directory containing `.agent-relay/connection.json` when + /// auto-discovering the broker. 
+ #[arg(long)] + pub(crate) state_dir: Option, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +pub(crate) enum DumpPtyFormat { + Plain, + Ansi, +} + +impl DumpPtyFormat { + pub(crate) fn as_wire_str(&self) -> &'static str { + match self { + Self::Plain => "plain", + Self::Ansi => "ansi", + } + } +} + +#[derive(Debug, clap::Args, Clone)] +pub(crate) struct McpArgsCommand { + /// CLI name or command to compute MCP args for. + #[arg(long)] + pub(crate) cli: String, + + /// Relaycast agent name to inject into the MCP configuration. + #[arg(long)] + pub(crate) agent_name: String, + + /// Relaycast API key. Falls back to RELAY_API_KEY when omitted. + #[arg(long)] + pub(crate) api_key: Option, + + /// Relaycast base URL. Falls back to RELAY_BASE_URL when omitted. + #[arg(long)] + pub(crate) base_url: Option, + + /// Pre-registered agent token to pass to the child MCP server. + #[arg(long)] + pub(crate) agent_token: Option, + + /// Register a fresh Relaycast agent token and inject it into the child MCP server. + #[arg(long)] + pub(crate) register: bool, + + /// Multi-workspace context JSON to pass to the child MCP server. + #[arg(long)] + pub(crate) workspaces_json: Option, + + /// Default workspace ID/name to pass to the child MCP server. + #[arg(long)] + pub(crate) default_workspace: Option, + + /// Working directory used by CLIs that need local MCP config files. + #[arg(long)] + pub(crate) cwd: Option, + + /// Existing CLI args as a JSON string array, e.g. '["--foo","--bar"]'. + #[arg(long)] + pub(crate) existing_args: Option, +} + +#[derive(Debug, clap::Args)] +pub(crate) struct InitCommand { + #[arg(long, default_value = "")] + pub(crate) name: String, + + #[arg(long, default_value = "general")] + pub(crate) channels: String, + + /// Optional HTTP API port for dashboard proxy (0 = disabled) + #[arg(long, default_value = "0")] + pub(crate) api_port: u16, + + /// Bind address for the HTTP API (default: 127.0.0.1). 
+ /// Use 0.0.0.0 to accept connections from outside the host (e.g. in + /// Daytona sandboxes where a remote client connects via preview URL). + #[arg(long, default_value = "127.0.0.1")] + pub(crate) api_bind: String, + + /// Enable persistence: write state, pending-deliveries, lock, PID, and MCP + /// config to `.agent-relay/` in the working directory. When omitted (the + /// default), runtime files are written to a deterministic temp directory and + /// cleaned up opportunistically; identity registration is non-strict to avoid + /// stale-name collisions across short-lived sessions. + #[arg(long, default_value_t = false)] + pub(crate) persist: bool, + + /// Override the directory used for broker state files (connection.json, + /// locks, state, pending-deliveries). Defaults to `.agent-relay/` in the + /// working directory when `--persist` is set, or a temp directory otherwise. + #[arg(long)] + pub(crate) state_dir: Option, +} + +#[derive(Debug, clap::Args, Clone)] +pub(crate) struct PtyCommand { + pub(crate) cli: String, + + #[arg(last = true)] + pub(crate) args: Vec, + + #[arg(long)] + pub(crate) agent_name: Option, + + /// Emit delivery_active events when output matches progress patterns. + #[arg(long)] + pub(crate) progress: bool, + + /// Silence duration in seconds before emitting agent_idle (0 = disabled). 
+ #[arg(long, default_value = "30")] + pub(crate) idle_threshold_secs: u64, +} + +#[derive(Debug, clap::Args, Clone)] +pub(crate) struct HeadlessCommand { + pub(crate) provider: HeadlessCliProvider, + + #[arg(last = true)] + pub(crate) args: Vec, + + #[arg(long)] + pub(crate) agent_name: Option, +} + +#[derive(Debug, Clone, Copy, ValueEnum)] +pub(crate) enum HeadlessCliProvider { + Claude, + Opencode, +} + +impl From for ProtocolHeadlessProvider { + fn from(value: HeadlessCliProvider) -> Self { + match value { + HeadlessCliProvider::Claude => Self::Claude, + HeadlessCliProvider::Opencode => Self::Opencode, + } + } +} diff --git a/crates/broker/src/cli_mcp_args.rs b/crates/broker/src/cli_mcp_args.rs index 43de92c94..372068afa 100644 --- a/crates/broker/src/cli_mcp_args.rs +++ b/crates/broker/src/cli_mcp_args.rs @@ -10,7 +10,7 @@ use relay_broker::{ }; use serde::{Deserialize, Serialize}; -use crate::McpArgsCommand; +use crate::cli::McpArgsCommand; #[derive(Debug, Clone, PartialEq, Eq, Serialize, Deserialize)] #[serde(rename_all = "camelCase")] diff --git a/crates/broker/src/main.rs b/crates/broker/src/main.rs index d70cd7ffb..619366367 100644 --- a/crates/broker/src/main.rs +++ b/crates/broker/src/main.rs @@ -1,18 +1,12 @@ -use std::{ - collections::{HashMap, HashSet, VecDeque}, - path::{Path, PathBuf}, - process::Stdio, - sync::{Arc, OnceLock}, - time::{Duration, Instant}, -}; - mod broker; +mod cli; mod cli_mcp_args; mod helpers; mod listen_api; mod pty_worker; mod readiness; mod routing; +mod runtime; mod spawner; mod swarm; mod swarm_tui; @@ -21,7730 +15,14 @@ mod worker; mod worker_request; mod wrap; -use helpers::{ - agent_name_eq, detect_bypass_permissions_prompt, detect_claude_trust_prompt, - detect_codex_model_prompt, detect_gemini_action_required, detect_gemini_trust_prompt, - detect_gemini_untrusted_banner, detect_opencode_permission_prompt, floor_char_boundary, - is_auto_suggestion, is_bypass_selection_menu, is_in_editor_mode, is_self_name, - 
normalize_cli_name, parse_cli_command, strip_ansi, -}; -use listen_api::{ - broadcast_if_relevant, listen_api_router, DeliveryRouteError, ListenApiConfig, - ListenApiRequest, SetInboundDeliveryModeOk, -}; -use routing::display_target_for_dashboard; - -use anyhow::{Context, Result}; -use clap::{Parser, Subcommand, ValueEnum}; -use relaycast::WsEvent; -use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; -use tokio::{ - io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}, - sync::{broadcast, mpsc, Notify, RwLock}, - time::{timeout, MissedTickBehavior}, -}; -use uuid::Uuid; - -use relay_broker::{ - auth::AuthClient, - control::{can_release_child, is_human_sender}, - dedup::DedupCache, - message_bridge::{map_ws_broker_command, map_ws_event}, - multi_workspace::{MultiWorkspaceSession, WorkspaceInboundMessage, WorkspaceMembershipSummary}, - protocol::{ - AgentRuntime, AgentSpec, HeadlessProvider as ProtocolHeadlessProvider, - MessageInjectionMode, ProtocolEnvelope, RelayDelivery, PROTOCOL_VERSION, - }, - pty::PtySession, - relaycast_ws::{ - format_worker_preregistration_error, registration_retry_after_secs, - retry_agent_registration, RegRetryOutcome, RelaycastHttpClient, WsControl, - }, - replay_buffer::{ReplayBuffer, DEFAULT_REPLAY_CAPACITY}, - snippets::ensure_relaycast_mcp_config, - telemetry::{ActionSource, TelemetryClient, TelemetryEvent}, - types::{ - BrokerCommandEvent, BrokerCommandPayload, InboundDeliveryDispatch, InboundDeliveryMode, - InboundDeliveryState, InboundKind, PendingRelayMessage, SenderKind, - }, -}; - -use spawner::{spawn_env_vars, Spawner}; -use worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; - -const DEFAULT_DELIVERY_RETRY_MS: u64 = 1_000; -const MAX_DELIVERY_RETRIES: u32 = 10; -const DEFAULT_RELAYCAST_BASE_URL: &str = "https://api.relaycast.dev"; -use helpers::resolve_dm_participants_cached; -const THREAD_HISTORY_LIMIT: usize = 1_000; -const DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS: u64 = 3_000; -const 
DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS: u64 = 20_000; -const DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS: u64 = 200; -static TRACING_GUARD: OnceLock = OnceLock::new(); - -fn startup_debug_enabled() -> bool { - std::env::var("AGENT_RELAY_STARTUP_DEBUG") - .map(|value| { - let trimmed = value.trim(); - !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false") - }) - .unwrap_or(false) -} - -fn log_startup_phase(enabled: bool, started_at: Instant, message: impl AsRef) { - if enabled { - eprintln!( - "[agent-relay][startup +{}ms] {}", - started_at.elapsed().as_millis(), - message.as_ref() - ); - } -} - -#[derive(Debug, Parser)] -#[command(name = "agent-relay-broker")] -#[command(about = "Agent relay broker and worker runtime")] -struct Cli { - #[command(subcommand)] - command: Commands, -} - -#[derive(Debug, Subcommand)] -enum Commands { - Init(InitCommand), - Pty(PtyCommand), - Headless(HeadlessCommand), - /// Compute MCP injection args and side-effect config file paths for a CLI - /// without spawning it. Outputs JSON to stdout. - McpArgs(McpArgsCommand), - /// Run ad-hoc swarm execution via the relay broker - Swarm(swarm::SwarmArgs), - /// Capture the current visible PTY screen of a running worker and print - /// it. Talks to the broker over its listen API. - DumpPty(DumpPtyCommand), - /// Internal: wraps a CLI in a PTY with interactive passthrough. - /// Used by the SDK — not for direct user invocation. - /// Usage: agent-relay-broker wrap codex -- --full-auto - #[command(hide = true)] - Wrap { - /// The CLI to wrap (e.g. "codex", "claude") - cli: String, - /// Additional arguments passed to the wrapped CLI - #[arg(trailing_var_arg = true, allow_hyphen_values = true)] - args: Vec, - }, -} - -#[derive(Debug, clap::Args, Clone)] -pub(crate) struct DumpPtyCommand { - /// Worker name to snapshot. - pub(crate) name: String, - - /// Snapshot format. 
`plain` is one-line-per-row UTF-8; `ansi` is the - /// reproduction byte stream (control characters + SGR + cursor commands) - /// suitable for piping into a terminal. - #[arg(long, default_value = "plain")] - pub(crate) format: DumpPtyFormat, - - /// Override the broker base URL. Falls back to RELAY_BROKER_URL, then to - /// reading `.agent-relay/connection.json` in the current directory. - #[arg(long)] - pub(crate) broker_url: Option, - - /// Override the broker API key. Falls back to RELAY_BROKER_API_KEY, then - /// to reading `.agent-relay/connection.json` in the current directory. - #[arg(long)] - pub(crate) api_key: Option, - - /// Override the directory containing `.agent-relay/connection.json` when - /// auto-discovering the broker. - #[arg(long)] - pub(crate) state_dir: Option, -} - -#[derive(Debug, Clone, Copy, ValueEnum)] -pub(crate) enum DumpPtyFormat { - Plain, - Ansi, -} - -impl DumpPtyFormat { - fn as_wire_str(&self) -> &'static str { - match self { - Self::Plain => "plain", - Self::Ansi => "ansi", - } - } -} - -#[derive(Debug, clap::Args, Clone)] -pub(crate) struct McpArgsCommand { - /// CLI name or command to compute MCP args for. - #[arg(long)] - cli: String, - - /// Relaycast agent name to inject into the MCP configuration. - #[arg(long)] - agent_name: String, - - /// Relaycast API key. Falls back to RELAY_API_KEY when omitted. - #[arg(long)] - api_key: Option, - - /// Relaycast base URL. Falls back to RELAY_BASE_URL when omitted. - #[arg(long)] - base_url: Option, - - /// Pre-registered agent token to pass to the child MCP server. - #[arg(long)] - agent_token: Option, - - /// Register a fresh Relaycast agent token and inject it into the child MCP server. - #[arg(long)] - register: bool, - - /// Multi-workspace context JSON to pass to the child MCP server. - #[arg(long)] - workspaces_json: Option, - - /// Default workspace ID/name to pass to the child MCP server. 
- #[arg(long)] - default_workspace: Option, - - /// Working directory used by CLIs that need local MCP config files. - #[arg(long)] - cwd: Option, - - /// Existing CLI args as a JSON string array, e.g. '["--foo","--bar"]'. - #[arg(long)] - existing_args: Option, -} - -#[derive(Debug, clap::Args)] -struct InitCommand { - #[arg(long, default_value = "")] - name: String, - - #[arg(long, default_value = "general")] - channels: String, - - /// Optional HTTP API port for dashboard proxy (0 = disabled) - #[arg(long, default_value = "0")] - api_port: u16, - - /// Bind address for the HTTP API (default: 127.0.0.1). - /// Use 0.0.0.0 to accept connections from outside the host (e.g. in - /// Daytona sandboxes where a remote client connects via preview URL). - #[arg(long, default_value = "127.0.0.1")] - api_bind: String, - - /// Enable persistence: write state, pending-deliveries, lock, PID, and MCP - /// config to `.agent-relay/` in the working directory. When omitted (the - /// default), runtime files are written to a deterministic temp directory and - /// cleaned up opportunistically; identity registration is non-strict to avoid - /// stale-name collisions across short-lived sessions. - #[arg(long, default_value_t = false)] - persist: bool, - - /// Override the directory used for broker state files (connection.json, - /// locks, state, pending-deliveries). Defaults to `.agent-relay/` in the - /// working directory when `--persist` is set, or a temp directory otherwise. - #[arg(long)] - state_dir: Option, -} - -#[derive(Debug, clap::Args, Clone)] -struct PtyCommand { - cli: String, - - #[arg(last = true)] - args: Vec, - - #[arg(long)] - agent_name: Option, - - /// Emit delivery_active events when output matches progress patterns. - #[arg(long)] - progress: bool, - - /// Silence duration in seconds before emitting agent_idle (0 = disabled). 
- #[arg(long, default_value = "30")] - idle_threshold_secs: u64, -} - -#[derive(Debug, clap::Args, Clone)] -struct HeadlessCommand { - provider: HeadlessCliProvider, - - #[arg(last = true)] - args: Vec, - - #[arg(long)] - agent_name: Option, -} - -#[derive(Debug, Clone, Copy, ValueEnum)] -enum HeadlessCliProvider { - Claude, - Opencode, -} - -impl From for ProtocolHeadlessProvider { - fn from(value: HeadlessCliProvider) -> Self { - match value { - HeadlessCliProvider::Claude => Self::Claude, - HeadlessCliProvider::Opencode => Self::Opencode, - } - } -} - -fn headless_provider_cli_name(provider: &ProtocolHeadlessProvider) -> &'static str { - match provider { - ProtocolHeadlessProvider::Claude => "claude", - ProtocolHeadlessProvider::Opencode => "opencode", - } -} - -fn headless_provider_command( - provider: &ProtocolHeadlessProvider, - task: &str, - extra_args: &[String], -) -> (String, Vec) { - match provider { - ProtocolHeadlessProvider::Claude => { - let mut args = vec![ - "-p".to_string(), - "--dangerously-skip-permissions".to_string(), - ]; - args.extend(extra_args.iter().cloned()); - args.push(task.to_string()); - ("claude".to_string(), args) - } - ProtocolHeadlessProvider::Opencode => { - let mut args = vec!["run".to_string()]; - args.extend(extra_args.iter().cloned()); - args.push(task.to_string()); - ("opencode".to_string(), args) - } - } -} - -fn headless_provider_from_cli(value: &str) -> Option { - match value.trim().to_ascii_lowercase().as_str() { - "claude" => Some(ProtocolHeadlessProvider::Claude), - "opencode" => Some(ProtocolHeadlessProvider::Opencode), - _ => None, - } -} - -fn runtime_label(runtime: &AgentRuntime) -> &'static str { - match runtime { - AgentRuntime::Pty => "pty", - AgentRuntime::Headless => "headless", - } -} - -#[allow(clippy::too_many_arguments)] -fn build_http_api_spawn_spec( - name: String, - cli: String, - transport: Option, - model: Option, - args: Vec, - channels: Vec, - cwd: Option, - team: Option, - shadow_of: Option, - 
shadow_mode: Option, - restart_policy: Option, -) -> Result { - let runtime = match transport - .as_deref() - .map(str::trim) - .filter(|value| !value.is_empty()) - .map(|value| value.to_ascii_lowercase()) - { - None => AgentRuntime::Pty, - Some(value) if value == "pty" => AgentRuntime::Pty, - Some(value) if value == "headless" => AgentRuntime::Headless, - Some(other) => { - anyhow::bail!("unsupported transport '{other}' (expected 'pty' or 'headless')") - } - }; - let parsed_restart_policy = match restart_policy { - Some(v) => Some(serde_json::from_value(v).context("invalid restart_policy")?), - None => None, - }; - - let (provider, cli_command, model) = match runtime { - AgentRuntime::Pty => (None, Some(cli), model), - AgentRuntime::Headless => { - let provider = headless_provider_from_cli(&cli).with_context(|| { - format!( - "provider '{cli}' does not support headless transport (supported: claude, opencode)" - ) - })?; - (Some(provider), None, model) - } - }; - - Ok(AgentSpec { - name, - runtime, - provider, - cli: cli_command, - model, - cwd, - team, - shadow_of, - shadow_mode, - args, - channels, - restart_policy: parsed_restart_policy, - }) -} - -#[derive(Debug)] -struct RuntimePaths { - persist: bool, - state: PathBuf, - pending: PathBuf, - /// Held for process lifetime to prevent concurrent broker instances (persist mode only). - #[allow(dead_code)] - _lock: Option, -} - -/// Shared Relaycast connection state used by run_init and run_wrap. 
-#[derive(Clone)] -struct RelayWorkspace { - workspace_id: String, - workspace_alias: Option, - relay_workspace_key: String, - self_name: String, - self_agent_id: String, - self_names: HashSet, - self_agent_ids: HashSet, - http_client: RelaycastHttpClient, - ws_control_tx: mpsc::Sender, -} - -struct RelaySession { - http_base: String, - default_workspace_id: Option, - workspaces: Vec, - ws_inbound_rx: mpsc::Receiver, -} - -#[derive(Clone)] -struct RelayReadyState { - workspace_key: String, - memberships: Vec, - default_workspace_id: Option, -} - -async fn serve_startup_api_until_ready( - listener: tokio::net::TcpListener, - relay_ready: Arc, -) -> tokio::net::TcpListener { - loop { - tokio::select! { - _ = relay_ready.notified() => { - return listener; - } - accepted = listener.accept() => { - match accepted { - Ok((stream, _addr)) => { - tokio::spawn(handle_startup_api_connection(stream)); - } - Err(error) => { - tracing::warn!(error = %error, "startup API accept failed"); - tokio::time::sleep(Duration::from_millis(50)).await; - } - } - } - } - } -} - -async fn handle_startup_api_connection(mut stream: tokio::net::TcpStream) { - let mut buffer = [0_u8; 1024]; - let read = match timeout(Duration::from_secs(5), stream.read(&mut buffer)).await { - Ok(Ok(read)) => read, - Ok(Err(error)) => { - tracing::debug!(error = %error, "failed reading startup API request"); - return; - } - Err(_) => return, - }; - - let request = String::from_utf8_lossy(&buffer[..read]); - let path = request - .lines() - .next() - .and_then(|line| line.split_whitespace().nth(1)) - .unwrap_or("/"); - let (status, content_type, body) = if path == "/health" { - ( - "200 OK", - "application/json", - listen_api::listen_api_health_payload(None, vec![]).to_string(), - ) - } else { - ( - "503 Service Unavailable", - "text/plain; charset=utf-8", - "Broker is starting, please retry".to_string(), - ) - }; - let response = format!( - "HTTP/1.1 {status}\r\ncontent-type: {content_type}\r\ncontent-length: 
{}\r\nconnection: close\r\n\r\n{body}", - body.len() - ); - if let Err(error) = stream.write_all(response.as_bytes()).await { - tracing::debug!(error = %error, "failed writing startup API response"); - } -} - -/// Build the standard env-var array passed to every spawned child agent. -fn normalize_initial_task(task: Option) -> Option { - task.and_then(|value| { - if value.trim().is_empty() { - None - } else { - Some(value) - } - }) -} - -struct RelaySessionOptions<'a> { - paths: &'a RuntimePaths, - requested_name: &'a str, - channels: Vec, - strict_name: bool, - agent_type: Option<&'a str>, - /// Read .mcp.json for additional self-name identities - read_mcp_identity: bool, - /// Write relaycast server entry to .mcp.json - ensure_mcp_config: bool, - runtime_cwd: &'a Path, -} - -async fn connect_relay(opts: RelaySessionOptions<'_>) -> Result { - let startup_debug = startup_debug_enabled(); - let connect_started = Instant::now(); - let http_base = std::env::var("RELAYCAST_BASE_URL") - .ok() - .or_else(|| std::env::var("RELAY_BASE_URL").ok()) - .unwrap_or_else(|| DEFAULT_RELAYCAST_BASE_URL.to_string()); - let ws_base = std::env::var("RELAYCAST_WS_URL") - .unwrap_or_else(|_| derive_ws_base_url_from_http(&http_base)); - - log_startup_phase( - startup_debug, - connect_started, - format!( - "connect_relay begin requested_name='{}' channels={}", - opts.requested_name, - opts.channels.join(",") - ), - ); - let auth = AuthClient::new(http_base.clone()); - let sessions = auth - .startup_session_set_with_options( - Some(opts.requested_name), - opts.strict_name, - opts.agent_type, - ) - .await - .context("failed to initialize relaycast session")?; - log_startup_phase( - startup_debug, - connect_started, - format!( - "startup_session_set_with_options complete memberships={}", - sessions.memberships.len() - ), - ); - - let default_session = sessions - .default_session() - .or_else(|| sessions.memberships.first()) - .context("no relaycast memberships were initialized")?; - let 
relay_workspace_key = default_session.credentials.api_key.clone(); - let self_agent_id = default_session.credentials.agent_id.clone(); - let self_token = default_session.token.clone(); - let agent_name = default_session - .credentials - .agent_name - .clone() - .unwrap_or_else(|| opts.requested_name.to_string()); - - let identity_debug = format!( - "agent_name='{}' -requested='{}' -agent_id='{}' -token_prefix='{}' -default_workspace='{}' -workspace_count='{}' -timestamp='{}' -", - agent_name, - opts.requested_name, - self_agent_id, - &self_token[..self_token.len().min(16)], - default_session.credentials.workspace_id, - sessions.memberships.len(), - chrono::Utc::now().to_rfc3339() - ); - let debug_path = opts - .paths - .state - .parent() - .unwrap() - .join("identity-debug.txt"); - if std::env::var("AGENT_RELAY_NO_DEBUG_FILES").is_err() { - let _ = std::fs::write(&debug_path, &identity_debug); - eprintln!( - "[agent-relay] identity debug written to {}", - debug_path.display() - ); - } - if agent_name != opts.requested_name { - eprintln!( - "[agent-relay] WARNING: registered as '{}' (requested '{}')", - agent_name, opts.requested_name - ); - } - - if opts.ensure_mcp_config { - if let Err(error) = ensure_relaycast_mcp_config( - opts.runtime_cwd, - Some(relay_workspace_key.as_str()), - Some(http_base.as_str()), - None, - ) { - tracing::warn!("failed to ensure .mcp.json: {error}"); - } - } - - log_startup_phase( - startup_debug, - connect_started, - "MultiWorkspaceSession::new begin", - ); - let mut multi = MultiWorkspaceSession::new( - http_base.clone(), - ws_base, - auth, - sessions, - opts.channels, - opts.read_mcp_identity, - opts.runtime_cwd, - relay_broker::events::EventEmitter::new(false), - ); - log_startup_phase( - startup_debug, - connect_started, - format!( - "MultiWorkspaceSession::new complete handles={} default_workspace={:?}", - multi.handles.len(), - multi.default_workspace_id - ), - ); - - let default_workspace_id = multi.default_workspace_id.clone(); 
- let workspaces = multi - .handles - .drain(..) - .map(|handle| RelayWorkspace { - workspace_id: handle.workspace_id, - workspace_alias: handle.workspace_alias, - relay_workspace_key: handle.relay_workspace_key, - self_name: handle.self_name, - self_agent_id: handle.self_agent_id, - self_names: handle.self_names, - self_agent_ids: handle.self_agent_ids, - http_client: handle.http_client, - ws_control_tx: handle.ws_control_tx, - }) - .collect(); - - Ok(RelaySession { - http_base, - default_workspace_id, - workspaces, - ws_inbound_rx: multi.inbound_rx, - }) -} - -#[derive(Debug, Clone)] -struct PendingDelivery { - worker_name: String, - delivery: RelayDelivery, - attempts: u32, - next_retry_at: Instant, -} - -/// Serializable snapshot of pending deliveries for crash recovery. -#[derive(Debug, Clone, Serialize, Deserialize)] -struct PersistedPendingDelivery { - worker_name: String, - delivery: RelayDelivery, - attempts: u32, -} - -fn save_pending_deliveries( - path: &Path, - deliveries: &HashMap, -) -> Result<()> { - let persisted: Vec = deliveries - .values() - .map(|pd| PersistedPendingDelivery { - worker_name: pd.worker_name.clone(), - delivery: pd.delivery.clone(), - attempts: pd.attempts, - }) - .collect(); - let json = serde_json::to_string_pretty(&persisted)?; - let dir = path.parent().unwrap_or(path); - let mut tmp = tempfile::NamedTempFile::new_in(dir) - .with_context(|| format!("failed creating temp file in {}", dir.display()))?; - std::io::Write::write_all(&mut tmp, json.as_bytes())?; - tmp.persist(path) - .with_context(|| format!("failed persisting pending deliveries to {}", path.display()))?; - Ok(()) -} - -fn load_pending_deliveries(path: &Path) -> HashMap { - let data = match std::fs::read_to_string(path) { - Ok(d) => d, - Err(_) => return HashMap::new(), - }; - let persisted: Vec = match serde_json::from_str(&data) { - Ok(v) => v, - Err(_) => return HashMap::new(), - }; - persisted - .into_iter() - .map(|p| { - let id = p.delivery.delivery_id.clone(); 
- ( - id, - PendingDelivery { - worker_name: p.worker_name, - delivery: p.delivery, - attempts: p.attempts, - next_retry_at: Instant::now(), // retry immediately on restart - }, - ) - }) - .collect() -} - -// These payload structs were used by the stdio protocol handler (handle_sdk_frame). -#[derive(Debug, Serialize)] -struct AgentMetrics { - name: String, - pid: u32, - memory_bytes: u64, - uptime_secs: u64, -} - -#[derive(Debug, Deserialize)] -struct DeliveryAckPayload { - delivery_id: String, - event_id: String, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -struct ThreadInfo { - thread_id: String, - name: String, - unread_count: usize, - #[serde(skip_serializing_if = "Option::is_none")] - last_message: Option, - #[serde(skip_serializing_if = "Option::is_none")] - last_message_at: Option, -} - -#[derive(Debug, Clone)] -struct ThreadAccumulator { - info: ThreadInfo, - sort_key: i64, -} - -fn normalize_sender(sender: Option) -> String { - let raw = sender - .unwrap_or_else(|| "human:orchestrator".to_string()) - .trim() - .to_string(); - if raw.is_empty() { - return "human:orchestrator".to_string(); - } - if let Some(rest) = raw.strip_prefix("human:") { - let normalized_rest = rest.trim(); - if normalized_rest.is_empty() { - return "human:orchestrator".to_string(); - } - return format!("human:{normalized_rest}"); - } - raw -} - -fn sender_is_dashboard_label(sender: &str, self_name: &str) -> bool { - let trimmed = sender.trim(); - trimmed.eq_ignore_ascii_case("Dashboard") - || trimmed.eq_ignore_ascii_case("human:Dashboard") - || trimmed.eq_ignore_ascii_case("human:orchestrator") - || trimmed.eq_ignore_ascii_case(self_name) -} +use anyhow::Result; #[tokio::main] async fn main() -> Result<()> { - init_tracing(); - - let cli = Cli::parse(); - let telemetry = TelemetryClient::new(); - - let command_name = match &cli.command { - Commands::Init(_) => "init", - Commands::Pty(_) => "pty", - Commands::Headless(_) => "headless", - 
Commands::McpArgs(_) => "mcp_args", - Commands::Swarm(_) => "swarm", - Commands::DumpPty(_) => "dump_pty", - Commands::Wrap { .. } => "wrap", - }; - telemetry.track(TelemetryEvent::CliCommandRun { - command_name: command_name.to_string(), - }); - - match cli.command { - Commands::Init(cmd) => run_init(cmd, telemetry).await, - Commands::Pty(cmd) => pty_worker::run_pty_worker(cmd).await, - Commands::Headless(cmd) => run_headless_worker(cmd).await, - Commands::McpArgs(cmd) => cli_mcp_args::run_mcp_args(cmd).await, - Commands::Swarm(args) => swarm::run_swarm(args).await, - Commands::DumpPty(cmd) => run_dump_pty(cmd).await, - Commands::Wrap { cli, args } => wrap::run_wrap(cli, args, false, telemetry).await, - } -} - -/// Connection metadata discovered from a running broker — typically by -/// reading `/connection.json` or from explicit CLI flags / env. -struct BrokerConnection { - base_url: String, - api_key: Option, -} - -/// Resolve the broker connection by checking, in order: -/// -/// 1. Explicit CLI args (`--broker-url`, `--api-key`). When `--broker-url` -/// is supplied without an API key, we still attempt to fall back to the -/// API key from env / `.agent-relay/connection.json` so users don't have -/// to repeat `--api-key` for every dump-pty invocation. -/// 2. Env vars `RELAY_BROKER_URL` / `RELAY_BROKER_API_KEY`. -/// 3. `connection.json` in the supplied state dir, otherwise -/// `.agent-relay/connection.json` directly under the current working -/// directory. The bare `cwd` is intentionally NOT probed — an unrelated -/// `connection.json` sitting in the user's repo root must not silently -/// redirect the snapshot request (and its broker API key) elsewhere. -fn discover_broker_connection( - explicit_url: Option<&str>, - explicit_api_key: Option<&str>, - state_dir: Option<&Path>, -) -> Result { - // Walk the same search roots used for the URL fallback, but only to - // pull out a stored `api_key`. 
Lets `--broker-url` reuse the broker's - // saved key when the env var and `--api-key` are both unset. - let api_key_from_connection_file = || -> Option { - let cwd = std::env::current_dir().ok()?; - let roots: Vec = match state_dir { - Some(dir) => vec![dir.to_path_buf()], - None => vec![cwd.join(".agent-relay")], - }; - for root in roots { - let path = root.join("connection.json"); - if !path.is_file() { - continue; - } - let body = std::fs::read_to_string(&path).ok()?; - let value: Value = serde_json::from_str(&body).ok()?; - if let Some(key) = value.get("api_key").and_then(Value::as_str) { - if !key.trim().is_empty() { - return Some(key.to_string()); - } - } - } - None - }; - - let resolve_api_key = |explicit: Option<&str>| -> Option { - explicit - .map(ToString::to_string) - .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) - .or_else(api_key_from_connection_file) - .filter(|value| !value.trim().is_empty()) - }; - - if let Some(url) = explicit_url { - return Ok(BrokerConnection { - base_url: url.trim_end_matches('/').to_string(), - api_key: resolve_api_key(explicit_api_key), - }); - } - - if let Ok(url) = std::env::var("RELAY_BROKER_URL") { - let trimmed = url.trim(); - if !trimmed.is_empty() { - return Ok(BrokerConnection { - base_url: trimmed.trim_end_matches('/').to_string(), - api_key: resolve_api_key(explicit_api_key), - }); - } - } - - let cwd = std::env::current_dir().context("failed to read current directory")?; - let search_roots: Vec = match state_dir { - Some(dir) => vec![dir.to_path_buf()], - None => vec![cwd.join(".agent-relay")], - }; - - for root in &search_roots { - let path = root.join("connection.json"); - if !path.is_file() { - continue; - } - let body = std::fs::read_to_string(&path) - .with_context(|| format!("failed reading {}", path.display()))?; - let value: Value = serde_json::from_str(&body) - .with_context(|| format!("failed parsing {}", path.display()))?; - let url = value - .get("url") - .and_then(Value::as_str) - 
.with_context(|| format!("connection file missing 'url': {}", path.display()))? - .to_string(); - let api_key = explicit_api_key - .map(ToString::to_string) - .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) - .or_else(|| { - value - .get("api_key") - .and_then(Value::as_str) - .map(ToString::to_string) - }) - .filter(|value| !value.trim().is_empty()); - return Ok(BrokerConnection { - base_url: url.trim_end_matches('/').to_string(), - api_key, - }); - } - - anyhow::bail!( - "could not locate broker connection. Pass --broker-url, set RELAY_BROKER_URL, \ - or run from a directory containing .agent-relay/connection.json" - ); -} - -/// `agent-relay-broker dump-pty ` — capture and print a worker's -/// current visible screen by hitting the broker's snapshot route. -async fn run_dump_pty(cmd: DumpPtyCommand) -> Result<()> { - use base64::Engine; - - let connection = discover_broker_connection( - cmd.broker_url.as_deref(), - cmd.api_key.as_deref(), - cmd.state_dir.as_deref(), - )?; - - let url = format!( - "{}/api/spawned/{}/snapshot?format={}", - connection.base_url, - urlencoding::encode(&cmd.name), - cmd.format.as_wire_str(), - ); - - let client = reqwest::Client::builder() - .timeout(Duration::from_secs(10)) - .build() - .context("failed to build http client")?; - - let mut request = client.get(&url); - if let Some(key) = connection.api_key.as_deref() { - request = request.header("X-API-Key", key); - } - let response = request - .send() - .await - .with_context(|| format!("failed reaching broker at {url}"))?; - let status = response.status(); - let body_bytes = response - .bytes() - .await - .context("failed reading broker response body")?; - - if !status.is_success() { - let body_str = String::from_utf8_lossy(&body_bytes); - anyhow::bail!("broker returned {status}: {body_str}"); - } - - let body: Value = - serde_json::from_slice(&body_bytes).context("broker response was not valid JSON")?; - let screen = body - .get("screen") - .and_then(Value::as_str) - 
.context("broker response missing 'screen' field")?; - - match cmd.format { - DumpPtyFormat::Plain => { - // The plain payload already includes the trailing newline per row. - // Print as-is so pipelines see a stable terminator. - use std::io::Write; - let mut stdout = std::io::stdout().lock(); - stdout - .write_all(screen.as_bytes()) - .context("failed writing snapshot to stdout")?; - stdout.flush().ok(); - } - DumpPtyFormat::Ansi => { - let bytes = base64::engine::general_purpose::STANDARD - .decode(screen) - .context("broker returned non-base64 ansi screen")?; - use std::io::Write; - let mut stdout = std::io::stdout().lock(); - stdout - .write_all(&bytes) - .context("failed writing snapshot to stdout")?; - stdout.flush().ok(); - } - } - - Ok(()) -} - -async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { - let broker_start = Instant::now(); - let startup_debug = startup_debug_enabled(); - let mut agent_spawn_count: u32 = 0; - telemetry.track(TelemetryEvent::BrokerStart); - - let runtime_cwd = std::env::current_dir()?; - let resolved_name = if cmd.name.trim().is_empty() { - runtime_cwd - .file_name() - .and_then(|name| name.to_str()) - .filter(|name| !name.is_empty()) - .unwrap_or("project") - .to_string() - } else { - cmd.name.trim().to_string() - }; - let custom_state_dir = cmd.state_dir.as_ref().map(PathBuf::from); - log_startup_phase( - startup_debug, - broker_start, - format!( - "run_init begin name='{}' cwd='{}' persist={} channels='{}'", - resolved_name, - runtime_cwd.display(), - cmd.persist, - cmd.channels - ), - ); - let paths = if cmd.persist || custom_state_dir.is_some() { - ensure_runtime_paths(&runtime_cwd, &resolved_name, custom_state_dir.as_deref())? - } else { - // Warn if a stale .agent-relay/ dir exists from a previous persist run. - // Agents can read files from it directly (logs, state) and get confused. 
- let stale_dir = runtime_cwd.join(".agent-relay"); - if stale_dir.exists() { - eprintln!( - "[agent-relay] WARNING: stale .agent-relay/ directory found in {}", - runtime_cwd.display() - ); - eprintln!( - "[agent-relay] WARNING: remove it to avoid confusing spawned agents: rm -rf {}", - stale_dir.display() - ); - } - ensure_ephemeral_paths(&runtime_cwd, &resolved_name)? - }; - log_startup_phase( - startup_debug, - broker_start, - format!("runtime paths ready state='{}'", paths.state.display()), - ); - let mut state = if cmd.persist || custom_state_dir.is_some() { - broker::BrokerState::load(&paths.state).unwrap_or_default() - } else { - broker::BrokerState::default() - }; - - // Clean up agents from previous sessions whose processes have died - let reaped = state.reap_dead_agents(); - if !reaped.is_empty() { - tracing::info!( - agents = ?reaped, - "reaped {} dead agent(s) from previous session", - reaped.len() - ); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state after reaping dead agents"); - } - } - } - - if std::env::var("AGENT_RELAY_DISABLE_RELAYCAST").is_ok() { - anyhow::bail!( - "AGENT_RELAY_DISABLE_RELAYCAST is no longer supported; broker requires Relaycast" - ); - } - - // Use RELAY_AGENT_TYPE env var if set (e.g. "agent" for SDK-spawned brokers), - // otherwise default to "human" for interactive CLI usage. - let agent_type_env = std::env::var("RELAY_AGENT_TYPE").ok(); - let agent_type_ref = agent_type_env.as_deref().unwrap_or("human"); - - // HTTP/WS API — always started. This is the primary transport for SDK - // consumers, dashboards, and remote clients. When no explicit API key - // is configured, generate a random one so control endpoints are always - // authenticated (the key is written to the runtime metadata file for - // SDK discovery). 
- let api_key = std::env::var("RELAY_BROKER_API_KEY") - .ok() - .filter(|v| !v.trim().is_empty()) - .unwrap_or_else(|| format!("br_{}", Uuid::new_v4().simple())); - - // Set the env var so listen_api's configured_broker_api_key() picks it up. - std::env::set_var("RELAY_BROKER_API_KEY", &api_key); - - let relay_ready = Arc::new(Notify::new()); - let relay_ready_state: Arc>> = Arc::new(RwLock::new(None)); - let (api_tx, mut api_rx) = mpsc::channel::(32); - let bind_addr = format!("{}:{}", cmd.api_bind, cmd.api_port); - log_startup_phase( - startup_debug, - broker_start, - format!("binding API listener on {}", bind_addr), - ); - let listener = tokio::net::TcpListener::bind(&bind_addr) - .await - .with_context(|| format!("failed to bind API on {}", bind_addr))?; - let actual_port = listener.local_addr()?.port(); - log_startup_phase( - startup_debug, - broker_start, - format!("API listener bound on {}:{}", cmd.api_bind, actual_port), - ); - // Machine-readable on stdout (SDK parses this to discover the port). - // Diagnostic logs stay on stderr via tracing/eprintln. - println!( - "[agent-relay] API listening on http://{}:{}", - cmd.api_bind, actual_port - ); - - // Write connection file so CLI commands can find this broker. 
- let connection_dir = paths.state.parent().unwrap(); - let connection_path = connection_dir.join("connection.json"); - let connection = json!({ - "url": format!("http://{}:{}", cmd.api_bind, actual_port), - "port": actual_port, - "api_key": &api_key, - "pid": std::process::id(), - }); - if let Ok(json_str) = serde_json::to_string_pretty(&connection) { - if let Ok(mut tmp) = tempfile::NamedTempFile::new_in(connection_dir) { - use std::io::Write; - if tmp.write_all(json_str.as_bytes()).is_ok() { - let _ = tmp.persist(&connection_path); - tracing::info!(path = %connection_path.display(), "wrote connection file"); - } - } - } - - let (startup_listener_tx, startup_listener_rx) = - tokio::sync::oneshot::channel::(); - let relay_ready_for_startup = relay_ready.clone(); - tokio::spawn(async move { - let listener = serve_startup_api_until_ready(listener, relay_ready_for_startup).await; - let _ = startup_listener_tx.send(listener); - }); - - log_startup_phase(startup_debug, broker_start, "calling connect_relay"); - let relay = connect_relay(RelaySessionOptions { - paths: &paths, - requested_name: &resolved_name, - channels: channels_from_csv(&cmd.channels), - // Ephemeral brokers are short-lived and frequently restarted by tests/SDK - // callers. Use non-strict registration so stale Relaycast identities from - // prior runs don't hard-fail startup. 
- strict_name: cmd.persist, - agent_type: Some(agent_type_ref), - read_mcp_identity: true, - ensure_mcp_config: cmd.persist, - runtime_cwd: &runtime_cwd, - }) - .await?; - log_startup_phase(startup_debug, broker_start, "connect_relay completed"); - - let RelaySession { - http_base, - default_workspace_id, - workspaces, - mut ws_inbound_rx, - } = relay; - let workspace_lookup: HashMap = workspaces - .iter() - .cloned() - .map(|workspace| (workspace.workspace_id.clone(), workspace)) - .collect(); - let default_workspace = if let Some(default_workspace_id) = default_workspace_id.as_deref() { - workspaces - .iter() - .find(|workspace| workspace.workspace_id == default_workspace_id) - .or_else(|| workspaces.first()) - } else { - workspaces.first() - } - .cloned() - .context("no relay workspace was available after initialization")?; - let relay_workspace_key = default_workspace.relay_workspace_key.clone(); - let self_names = default_workspace.self_names.clone(); - let ws_control_tx = default_workspace.ws_control_tx.clone(); - let relaycast_http = default_workspace.http_client.clone(); - let workspace_memberships: Vec = workspaces - .iter() - .map(|workspace| WorkspaceMembershipSummary { - workspace_id: workspace.workspace_id.clone(), - workspace_alias: workspace.workspace_alias.clone(), - is_default: default_workspace_id - .as_deref() - .is_some_and(|workspace_id| workspace_id == workspace.workspace_id), - }) - .collect(); - let relay_workspaces_json = serde_json::to_string( - &workspaces - .iter() - .map(|workspace| { - serde_json::json!({ - "workspace_id": workspace.workspace_id, - "workspace_alias": workspace.workspace_alias, - "api_key": workspace.relay_workspace_key, - }) - }) - .collect::>(), - )?; - - // Broadcast channel for streaming dashboard-relevant events to WS clients. - // Created before publishing the ready router so replay and WS endpoints are - // available as soon as Relaycast workspace data is known. 
- let (events_tx, _events_rx) = broadcast::channel::(512); - let replay_buffer = ReplayBuffer::new(DEFAULT_REPLAY_CAPACITY); - - let ready_router = listen_api_router(ListenApiConfig { - tx: api_tx.clone(), - events_tx: events_tx.clone(), - replay_buffer: replay_buffer.clone(), - workspace_key: Some(relay_workspace_key.clone()), - memberships: workspace_memberships.clone(), - default_workspace_id: default_workspace_id.clone(), - persist: cmd.persist, - }); - { - let mut ready = relay_ready_state.write().await; - *ready = Some(RelayReadyState { - workspace_key: relay_workspace_key.clone(), - memberships: workspace_memberships.clone(), - default_workspace_id: default_workspace_id.clone(), - }); - } - if let Some(ready) = relay_ready_state.read().await.as_ref() { - log_startup_phase( - startup_debug, - broker_start, - format!( - "relay ready workspace_key_set={} memberships={} default_workspace={:?}", - !ready.workspace_key.is_empty(), - ready.memberships.len(), - ready.default_workspace_id - ), - ); - } - relay_ready.notify_one(); - let listener = startup_listener_rx - .await - .context("startup API listener task stopped before Relaycast readiness handoff")?; - tokio::spawn(async move { - if let Err(e) = axum::serve(listener, ready_router).await { - tracing::error!(error = %e, "HTTP API server error"); - } - }); - - log_startup_phase( - startup_debug, - broker_start, - format!( - "ensuring default channels for {} workspaces", - workspaces.len() - ), - ); - for workspace in &workspaces { - if let Err(error) = workspace.http_client.ensure_default_channels().await { - tracing::warn!(workspace_id = %workspace.workspace_id, error = %error, "failed to ensure default channels"); - } - } - log_startup_phase(startup_debug, broker_start, "default channels ensured"); - - let extra_channels = channels_from_csv(&cmd.channels); - log_startup_phase( - startup_debug, - broker_start, - format!("ensuring extra channels count={}", extra_channels.len()), - ); - for workspace in 
&workspaces { - if let Err(error) = workspace - .http_client - .ensure_extra_channels(&extra_channels) - .await - { - tracing::warn!(workspace_id = %workspace.workspace_id, error = %error, "failed to ensure extra channels"); - } - } - log_startup_phase(startup_debug, broker_start, "extra channels ensured"); - - if !extra_channels.is_empty() { - log_startup_phase( - startup_debug, - broker_start, - "subscribing websocket control channels", - ); - for workspace in &workspaces { - let _ = workspace - .ws_control_tx - .send(WsControl::Subscribe(extra_channels.clone())) - .await; - } - log_startup_phase( - startup_debug, - broker_start, - "websocket subscriptions updated", - ); - } - - let mut worker_env = vec![ - ("RELAY_BASE_URL".to_string(), http_base.clone()), - ("RELAY_API_KEY".to_string(), relay_workspace_key.clone()), - ( - "RELAY_WORKSPACES_JSON".to_string(), - relay_workspaces_json.clone(), - ), - ]; - if let Some(default_workspace_id) = default_workspace_id.clone() { - // Do NOT stamp RELAYFILE_WORKSPACE from default_workspace_id. The - // relaycast workspace id and the relayfile workspace id are - // independent — a relayfile JWT scoped to a different workspace will - // 403 with "workspace mismatch" when the relayfile MCP sends the - // wrong id. Callers that share an id across both services (e.g. the - // canonical `relay on start` flow) set RELAYFILE_WORKSPACE - // themselves through per-spawn env_vars. 
- worker_env.push(( - "RELAY_DEFAULT_WORKSPACE".to_string(), - default_workspace_id.clone(), - )); - worker_env.push(("RELAY_WORKSPACE_ID".to_string(), default_workspace_id)); - } - - let (sdk_out_tx, mut sdk_out_rx) = mpsc::channel::>(1024); - let events_tx_for_stdout = events_tx.clone(); - let replay_buffer_for_stdout = replay_buffer.clone(); - tokio::spawn(async move { - while let Some(frame) = sdk_out_rx.recv().await { - // Broadcast events to WS clients (the primary SDK transport) - if frame.msg_type == "event" { - broadcast_if_relevant( - &events_tx_for_stdout, - &replay_buffer_for_stdout, - &frame.payload, - ) - .await; - } - // Note: stdout writing is removed. The HTTP/WS API is the - // only SDK transport. Events flow through broadcast_if_relevant - // → events_tx → WS clients. - } - }); - - let (worker_event_tx, mut worker_event_rx) = mpsc::channel::(1024); - let worker_logs_dir = paths - .state - .parent() - .expect("state path should always have a parent") - .join("team") - .join("worker-logs"); - let mut workers = - WorkerRegistry::new(worker_event_tx, worker_env, worker_logs_dir, broker_start); - - // Load crash insights from previous session - let crash_insights_path = paths.state.parent().unwrap().join("crash-insights.json"); - let mut crash_insights = - relay_broker::crash_insights::CrashInsights::load(&crash_insights_path); - - let mut sdk_lines = BufReader::new(tokio::io::stdin()).lines(); - let mut stdin_open = true; - let mut reap_tick = tokio::time::interval(Duration::from_millis(500)); - reap_tick.set_missed_tick_behavior(MissedTickBehavior::Skip); - let mut dedup = DedupCache::new(Duration::from_secs(300), 8192); - let delivery_retry_interval = delivery_retry_interval(); - let mut pending_deliveries = load_pending_deliveries(&paths.pending); - let mut terminal_failed_deliveries: HashSet = HashSet::new(); - // Outstanding worker-bound RPC requests waiting on a `*_response` - // frame from the wrapped worker. 
Keyed by the `request_id` we put on - // the outbound request frame; the reply `oneshot` is consumed when - // the worker echoes the same `request_id` back, or the entry expires - // via the deadline sweep in the `reap_tick` arm below. - // - // The generic correlation infrastructure lives in `crate::worker_request` - // so each new request/response route (`snapshot_pty`, `delivery-mode`, - // `pending`, `flush`, ...) costs about five lines of broker plumbing. - let mut pending_requests: HashMap = HashMap::new(); - // Per-worker inbound-delivery-mode + pending-relay-message queue. Lives - // parallel to `workers.workers` so we can swap modes / inspect / - // drain without touching `WorkerHandle` (which holds OS-level - // process state). See `relay_broker::types::InboundDeliveryState`. Entries - // are created lazily on first lookup and removed wherever workers - // exit (`Release` arm, `worker_exited` frame, `reap_exited` sweep). - let mut delivery_states: HashMap = HashMap::new(); - let mut dm_participants_cache: HashMap)> = HashMap::new(); - let mut recent_thread_messages: VecDeque = VecDeque::new(); - if !pending_deliveries.is_empty() { - tracing::info!( - count = pending_deliveries.len(), - "loaded {} pending deliveries from previous session", - pending_deliveries.len() - ); - } - - let mut shutdown = false; - - // Owner lease: in ephemeral mode, the broker shuts down if the SDK - // doesn't renew the lease within this duration. Replaces stdin EOF - // detection. Disabled in persist mode. - let lease_duration = if cmd.persist { - None - } else { - Some(Duration::from_secs(120)) - }; - let mut last_lease_renewal = Instant::now(); - let mut lease_check = tokio::time::interval(Duration::from_secs(10)); - lease_check.set_missed_tick_behavior(MissedTickBehavior::Skip); - - // Graceful-shutdown signal: SIGTERM on unix, Ctrl+Break/Close on Windows. - // `tokio::signal::ctrl_c()` is handled in its own select! arm below and - // works on both platforms. 
- #[cfg(unix)] - let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?; - #[cfg(windows)] - let mut sigterm = tokio::signal::windows::ctrl_shutdown()?; - - while !shutdown { - tokio::select! { - _ = tokio::signal::ctrl_c() => { - shutdown = true; - } - - _ = lease_check.tick() => { - if let Some(duration) = lease_duration { - if last_lease_renewal.elapsed() > duration { - tracing::info!( - elapsed_secs = last_lease_renewal.elapsed().as_secs(), - lease_secs = duration.as_secs(), - "owner lease expired — shutting down" - ); - shutdown = true; - } - } - } - - _ = sigterm.recv() => { - tracing::info!("received SIGTERM, shutting down"); - shutdown = true; - } - - // HTTP API requests (when --api-port is active) - result = api_rx.recv() => { - if let Some(req) = result { - match req { - ListenApiRequest::Spawn { - name, - cli, - transport, - model, - args, - task, - channels, - cwd, - team, - shadow_of, - shadow_mode, - continue_from, - idle_threshold_secs, - skip_relay_prompt, - restart_policy, - agent_token, - reply, - } => { - let effective_channels = if channels.is_empty() { - default_spawn_channels() - } else { - channels.clone() - }; - let spec = match build_http_api_spawn_spec( - name.clone(), - cli.clone(), - transport, - model.clone(), - args, - effective_channels.clone(), - cwd, - team, - shadow_of, - shadow_mode, - *restart_policy, - ) { - Ok(spec) => spec, - Err(error) => { - let _ = reply.send(Err(error.to_string())); - continue; - } - }; - let mut preregistration_warning: Option = None; - let registration_result = retry_agent_registration( - &relaycast_http, &name, Some(&cli), - ).await; - let worker_relay_key = match registration_result { - Ok(token) => Some(token), - Err(RegRetryOutcome::RetryableExhausted(error)) => { - let message = format_worker_preregistration_error(&name, &error); - tracing::warn!( - worker = %name, - error = %error, - "continuing spawn without pre-registration after retries exhausted" - ); - 
preregistration_warning = Some(message); - None - } - Err(RegRetryOutcome::Fatal(error)) => { - let _ = reply.send(Err(format_worker_preregistration_error(&name, &error))); - continue; - } - }; - - // Caller-supplied agent_token overrides auto-registration - let worker_relay_key = agent_token.or(worker_relay_key); - - let mut effective_task = normalize_initial_task(task); - if let Some(ref continue_from) = continue_from { - let continuity_dir = continuity_dir(&paths.state); - let continuity_file = continuity_dir.join(format!("{}.json", continue_from)); - if continuity_file.exists() { - match std::fs::read_to_string(&continuity_file) { - Ok(contents) => { - if let Ok(ctx) = serde_json::from_str::(&contents) { - let prev_task = ctx - .get("initial_task") - .and_then(Value::as_str) - .unwrap_or("unknown"); - let summary = ctx - .get("summary") - .and_then(Value::as_str) - .unwrap_or("no summary available"); - let messages = ctx - .get("message_history") - .and_then(Value::as_array) - .map(|msgs| { - msgs.iter() - .filter_map(|m| { - let from = m - .get("from") - .and_then(Value::as_str) - .unwrap_or("?"); - let text = m - .get("text") - .and_then(Value::as_str) - .unwrap_or(""); - if text.is_empty() { - None - } else { - Some(format!(" {}: {}", from, text)) - } - }) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - - let continuity_block = format!( - "## Continuity Context (from previous session as '{}')\n\ - Previous task: {}\n\ - Session summary: {}\n{}", - continue_from, - prev_task, - summary, - if messages.is_empty() { - String::new() - } else { - format!("Recent messages:\n{}\n", messages) - } - ); - - effective_task = Some(match effective_task { - Some(new_task) => { - format!( - "{}\n\n## Current Task\n{}", - continuity_block, new_task - ) - } - None => continuity_block, - }); - tracing::info!( - agent = %name, - continue_from = %continue_from, - "injected continuity context from previous session for HTTP API spawn" - ); - } - } - Err(e) => { - 
tracing::warn!( - agent = %name, - continue_from = %continue_from, - error = %e, - "failed to read continuity file for HTTP API spawn" - ); - } - } - } else { - tracing::warn!( - agent = %name, - continue_from = %continue_from, - "no continuity file found at {}", - continuity_file.display() - ); - } - } - - match workers.spawn( - spec, - Some("Dashboard".to_string()), - None, - worker_relay_key.clone(), - skip_relay_prompt, - idle_threshold_secs.map(|s| s.to_string()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::HumanDashboard, - has_task: effective_task.is_some(), - is_shadow: effective_spec.shadow_of.is_some() - || effective_spec.shadow_mode.is_some(), - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent { - runtime: effective_spec.runtime.clone(), - parent: Some("Dashboard".to_string()), - channels: effective_spec.channels.clone(), - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - note_local_spawn_control_dedup( - &mut dedup, - default_workspace_id - .as_deref() - .or_else(|| workspaces.first().map(|workspace| workspace.workspace_id.as_str())), - &name, - worker_relay_key.as_deref(), - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_spawned", - "name":&name, - "runtime":runtime_label(&effective_spec.runtime), - "provider": effective_spec.provider.clone(), - "cli": effective_spec.cli.clone(), - "model": 
effective_spec.model.clone(), - "pid":pid, - "source":"http_api", - "pre_registered": worker_relay_key.is_some(), - "registration_warning": preregistration_warning.clone(), - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "spawned", - Some("http_api_spawn"), - ) - .await; - let _ = reply.send(Ok(json!({ - "success": true, - "name": name, - "runtime": runtime_label(&effective_spec.runtime), - "model": effective_spec.model.clone(), - "pid": pid, - "pre_registered": worker_relay_key.is_some(), - "warning": preregistration_warning, - }))); - } - Err(e) => { - eprintln!("[agent-relay] HTTP API: failed to spawn '{}': {}", name, e); - let _ = reply.send(Err(e.to_string())); - } - } - } - ListenApiRequest::SetModel { name, model, timeout_ms, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - - let model_command = format!("/model {}\n", model); - let result = async { - handle - .stdin - .write_all(model_command.as_bytes()) - .await - .with_context(|| { - format!("failed writing model command to worker '{}'", name) - })?; - handle - .stdin - .flush() - .await - .with_context(|| { - format!("failed flushing worker '{}' stdin", name) - })?; - if let Some(timeout_ms) = timeout_ms { - tracing::info!( - name = %name, - timeout_ms, - "HTTP API set_model timeout_ms is currently advisory only" - ); - } - Ok::<(), anyhow::Error>(()) - } - .await; - - match result { - Ok(()) => { - let _ = reply.send(Ok(json!({ - "name": name, - "model": model, - "success": true, - }))); - } - Err(error) => { - let _ = reply.send(Err(error.to_string())); - } - } - } - ListenApiRequest::Release { name, reason, reply } => { - if let Some(ref r) = reason { - tracing::info!(worker = %name, reason = %r, "releasing agent via HTTP API"); - } - // Unregister from supervisor before release to prevent - // auto-restart of intentionally released agents. 
- workers.supervisor.unregister(&name); - workers.metrics.on_release(&name); - match workers.release(&name).await { - Ok(()) => { - if let Err(error) = relaycast_http.mark_agent_offline(&name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark released worker offline in relaycast" - ); - } - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({"kind":"delivery_dropped","name":&name,"count":dropped,"reason":"agent_released"}), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "agent_released"); - delivery_states.remove(&name); - state.agents.remove(&name); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_released","name":&name}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "exited", - Some("http_api_release"), - ) - .await; - let _ = reply.send(Ok(json!({ "success": true, "name": name }))); - } - Err(e) => { - let message = e.to_string(); - if is_unknown_worker_error_message(&message) { - relaycast_http.forget_agent_registration(&name); - state.agents.remove(&name); - if paths.persist { - let _ = state.save(&paths.state); - } - tracing::debug!( - worker = %name, - "ignoring duplicate HTTP API release for already exited worker" - ); - let _ = reply.send(Ok(json!({ "success": true, "name": name }))); - } else { - eprintln!("[agent-relay] HTTP API: failed to release '{}': {}", name, e); - let _ = reply.send(Err(message)); - } - } - } - } - ListenApiRequest::Send { - to, - text, - from, - thread_id, - workspace_id, - workspace_alias, - mode, - reply, - } => { - let normalized_to = to.trim().to_string(); - let selected_workspace = if let Some(workspace_id) = workspace_id.as_deref() { - workspace_lookup - .get(workspace_id) - .cloned() - .ok_or_else(|| format!("workspace_not_found:workspace '{}' is not attached", workspace_id)) - } else 
if let Some(workspace_alias) = workspace_alias.as_deref() { - workspaces - .iter() - .find(|workspace| { - workspace - .workspace_alias - .as_deref() - .is_some_and(|alias| alias.eq_ignore_ascii_case(workspace_alias)) - }) - .cloned() - .ok_or_else(|| format!("workspace_not_found:workspace alias '{}' is not attached", workspace_alias)) - } else if workspaces.len() == 1 { - Ok(workspaces[0].clone()) - } else if let Some(default_workspace_id) = default_workspace_id.as_deref() { - workspace_lookup - .get(default_workspace_id) - .cloned() - .ok_or_else(|| format!("workspace_not_found: default workspace '{}' not found", default_workspace_id)) - } else { - Err("ambiguous_workspace:workspaceId or workspaceAlias is required when multiple workspaces are attached".to_string()) - }; - let selected_workspace = match selected_workspace { - Ok(workspace) => workspace, - Err(error) => { - let _ = reply.send(Err(error)); - continue; - } - }; - let selected_workspace_id = selected_workspace.workspace_id.clone(); - let selected_workspace_alias = selected_workspace.workspace_alias.clone(); - let workspace_self_name = selected_workspace.self_name.clone(); - let normalized_sender = normalize_sender(from.clone()); - let from_dashboard = - sender_is_dashboard_label(&normalized_sender, &workspace_self_name); - let delivery_from = if from_dashboard { - workspace_self_name.clone() - } else { - normalized_sender.clone() - }; - tracing::info!( - target = "relay_broker::http_api", - - raw_from = ?from, - normalized_sender = %normalized_sender, - from_dashboard = %from_dashboard, - delivery_from = %delivery_from, - to = %normalized_to, - thread_id = ?thread_id, - self_name = %workspace_self_name, - "HTTP API send request" - ); - let ui_from = if from_dashboard { - workspace_self_name.clone() - } else { - normalized_sender - }; - let event_id = format!("http_{}", Uuid::new_v4().simple()); - let priority = if normalized_to.starts_with('#') { 3 } else { 2 }; - let mut delivered = 0usize; - let mut 
delivery_errors = 0usize; - let request_start = Instant::now(); - let local_delivery_timeout = http_api_local_delivery_timeout(); - let relaycast_timeout = http_api_relaycast_send_timeout(); - let event_emit_timeout = http_api_event_emit_timeout(); - - record_thread_history_event( - &mut recent_thread_messages, - json!({ - "event_id": event_id.clone(), - "from": ui_from.clone(), - "target": normalized_to.clone(), - "to": normalized_to.clone(), - "text": text.clone(), - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - "timestamp": chrono::Utc::now().to_rfc3339(), - }), - ); - - let targets = if normalized_to.starts_with('#') { - workers.worker_names_for_channel_delivery(&normalized_to, &delivery_from, Some(&selected_workspace_id)) - } else { - workers.worker_names_for_direct_target(&normalized_to, &delivery_from, Some(&selected_workspace_id)) - }; - - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - delivery_from = %delivery_from, - target_count = %targets.len(), - "resolved HTTP API send targets" - ); - - for worker_name in targets { - // Inbound-delivery queue: every inbound message - // enters the per-worker FIFO first. `auto_inject` - // drains immediately; `manual_flush` holds and - // counts as delivered so the HTTP caller's ack - // semantics are unchanged. We pass the FULL - // routing context so any drain reproduces the - // original delivery (channel/thread/workspace - // /priority/mode), not a stripped-down DM. 
- match queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - &worker_name, - InboundContext { - from: &delivery_from, - body: &text, - target: &normalized_to, - thread_id: thread_id.as_deref(), - workspace_id: Some(selected_workspace_id.as_str()), - workspace_alias: selected_workspace_alias.as_deref(), - priority, - mode: mode.clone(), - event_id: Some(&event_id), - }, - ) { - InboundQueueOutcome::Queued => { - delivered = delivered.saturating_add(1); - tracing::info!( - target = "relay_broker::http_api", - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - "queued local delivery (manual_flush inbound delivery mode)" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_queued", - "name":&worker_name, - "event_id":&event_id, - "from":&delivery_from, - "target":&normalized_to, - "reason":"inbound_delivery_manual_flush", - }), - ).await; - continue; - } - InboundQueueOutcome::DrainNow(to_drain) => { - for queued in to_drain { - let queued_event_id = - queued.event_id.as_deref().unwrap_or(""); - let is_current = - queued.event_id.as_deref() == Some(event_id.as_str()); - match timeout( - local_delivery_timeout, - try_inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &worker_name, - &queued, - delivery_retry_interval, - ), - ) - .await - { - Ok(Ok(_)) => { - if is_current { - delivered = delivered.saturating_add(1); - } - } - Ok(Err(error)) => { - if is_current { - delivery_errors = - delivery_errors.saturating_add(1); - } - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %queued_event_id, - to = %queued.target, - worker = %worker_name, - error = %error, - "local delivery attempt failed" - ); - } - Err(_) => { - if is_current { - delivery_errors = - delivery_errors.saturating_add(1); - } - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %queued_event_id, - to = %queued.target, - worker = %worker_name, - timeout_ms = 
%local_delivery_timeout.as_millis(), - "local delivery attempt timed out" - ); - } - } - } - continue; - } - InboundQueueOutcome::WorkerMissing => { - // Fall through so the standard - // not-found accounting path runs. - } - } - match timeout( - local_delivery_timeout, - queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &worker_name, - &event_id, - &delivery_from, - &normalized_to, - &text, - thread_id.clone(), - Some(selected_workspace_id.clone()), - selected_workspace_alias.clone(), - priority, - mode.clone(), - delivery_retry_interval, - ), - ) - .await - { - Ok(Ok(_)) => { - delivered = delivered.saturating_add(1); - } - Ok(Err(error)) => { - delivery_errors = delivery_errors.saturating_add(1); - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - error = %error, - "local delivery attempt failed" - ); - } - Err(_) => { - delivery_errors = delivery_errors.saturating_add(1); - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - timeout_ms = %local_delivery_timeout.as_millis(), - "local delivery attempt timed out" - ); - } - } - } - - if delivered > 0 { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - delivery_from = %delivery_from, - ui_from = %ui_from, - delivered = %delivered, - "local delivery succeeded" - ); - emit_http_api_event_with_timeout( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": event_id, - "from": ui_from, - "target": normalized_to, - "body": text, - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - }), - event_emit_timeout, - ) - .await; - if reply - .send(Ok(json!({ - "success": true, - "event_id": event_id, - "delivered": delivered, - "local": true, - "workspace_id": selected_workspace_id, - 
"workspace_alias": selected_workspace_alias, - }))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before local delivery response" - ); - } - } else { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - mode = ?mode, - delivery_errors = %delivery_errors, - delivery_from = %delivery_from, - ui_from = %ui_from, - relaycast_timeout_ms = %relaycast_timeout.as_millis(), - "no local deliveries succeeded; forwarding to relaycast" - ); - let relaycast_start = Instant::now(); - match timeout( - relaycast_timeout, - selected_workspace - .http_client - .send_with_mode(&normalized_to, &text, mode.clone()), - ) - .await - { - Ok(Ok(())) => { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_ms = %relaycast_start.elapsed().as_millis(), - "relaycast publish succeeded" - ); - emit_http_api_event_with_timeout( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": event_id, - "from": ui_from, - "target": normalized_to, - "body": text, - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - }), - event_emit_timeout, - ) - .await; - if reply - .send(Ok(json!({ - "success": true, - "event_id": event_id, - "relaycast_published": true, - "local": false, - "workspace_id": selected_workspace_id, - "workspace_alias": selected_workspace_alias, - }))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast response" - ); - } - } - Ok(Err(error)) => { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_ms = %relaycast_start.elapsed().as_millis(), - error = %error, - "relaycast publish failed" - ); - let not_found = format!("Agent 
\"{}\" not found", normalized_to); - if reply - .send(Err(format!( - "{not_found} and Relaycast publish failed: {error}" - ))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast failure response" - ); - } - } - Err(_) => { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_timeout_ms = %relaycast_timeout.as_millis(), - relaycast_ms = %relaycast_start.elapsed().as_millis(), - "relaycast publish timed out" - ); - let not_found = format!("Agent \"{}\" not found", normalized_to); - if reply - .send(Err(format!( - "{not_found} and Relaycast publish timed out after {}ms", - relaycast_timeout.as_millis() - ))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast timeout response" - ); - } - } - } - } - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - total_ms = %request_start.elapsed().as_millis(), - "HTTP API send request handling complete" - ); - } - ListenApiRequest::List { reply } => { - let _ = reply.send(Ok(json!({ "agents": workers.list() }))); - } - ListenApiRequest::Threads { reply } => { - let mut messages: Vec = - recent_thread_messages.iter().cloned().collect(); - match relaycast_http.get_all_dms(200).await { - Ok(dm_messages) => messages.extend(dm_messages), - Err(error) => { - tracing::debug!( - error = %error, - "failed to fetch relaycast dm history for /api/threads" - ); - } - } - let threads = build_thread_infos(&messages, &self_names); - let _ = reply.send(Ok(json!({ "threads": threads }))); - } - ListenApiRequest::SendInput { name, data, reply } => { - if let Err(err) = workers.send_to_worker( - &name, "write_pty", Some(format!("api_{}", Uuid::new_v4().simple())), - json!({ "data": data }), - ).await { - let _ = 
reply.send(Err(format!("agent_not_found: {}", err))); - } else { - let _ = reply.send(Ok(json!({ - "name": name, - "bytes_written": data.len(), - }))); - } - } - ListenApiRequest::ResizePty { name, rows, cols, reply } => { - if rows == 0 || cols == 0 { - let _ = reply.send(Err("invalid_dimensions: rows and cols must be >= 1".into())); - } else if let Err(err) = workers.send_to_worker( - &name, "resize_pty", Some(format!("api_{}", Uuid::new_v4().simple())), - json!({ "rows": rows, "cols": cols }), - ).await { - let _ = reply.send(Err(format!("agent_not_found: {}", err))); - } else { - let _ = reply.send(Ok(json!({ - "name": name, - "rows": rows, - "cols": cols, - }))); - } - } - ListenApiRequest::WorkerRequest { name, kind, payload, timeout, reply } => { - // Generic worker request/response: validate the - // worker exists and supports a PTY (all current - // request/response routes target the PTY side), - // then ship the frame and park the `reply` - // oneshot in `pending_requests`. The response is - // fulfilled either by the `*_response` arm below - // or by the deadline sweep in `reap_tick`. - // - // Headless workers don't run a VT and don't handle - // PTY-oriented RPCs — short-circuit with a typed - // error rather than letting the request sit until - // the timeout sweep returns a misleading - // `worker_timeout`. 
- let runtime = workers - .workers - .get(&name) - .map(|handle| handle.spec.runtime.clone()); - match runtime { - None => { - let _ = reply.send(Err( - worker_request::RequestWorkerError::WorkerNotFound( - format!("no worker named '{name}'"), - ), - )); - } - Some(AgentRuntime::Headless) => { - let _ = reply.send(Err( - worker_request::RequestWorkerError::UnsupportedRuntime( - format!("worker '{name}' is headless; {kind} is only supported on PTY workers"), - ), - )); - } - Some(AgentRuntime::Pty) => { - let request_id = format!("req_{}", Uuid::new_v4().simple()); - if let Err(err) = workers.send_to_worker( - &name, - &kind, - Some(request_id.clone()), - payload, - ).await { - let _ = reply.send(Err( - worker_request::RequestWorkerError::SendFailed( - err.to_string(), - ), - )); - } else { - pending_requests.insert( - request_id, - worker_request::PendingRequest { - kind, - worker_name: name, - reply, - deadline: Instant::now() + timeout, - }, - ); - } - } - } - } - ListenApiRequest::GetMetrics { agent, reply } => { - if let Some(ref agent_name) = agent { - if let Some(handle) = workers.workers.get(agent_name) { - let m = build_agent_metrics(handle); - let _ = reply.send(Ok(json!({ "agents": [m], "broker": workers.metrics.snapshot(workers.workers.len()) }))); - } else { - let _ = reply.send(Err(format!("unknown worker '{}'", agent_name))); - } - } else { - let mut agent_metrics: Vec = workers.workers.values() - .map(build_agent_metrics) - .collect(); - agent_metrics.sort_by(|a, b| a.name.cmp(&b.name)); - let _ = reply.send(Ok(json!({ - "agents": agent_metrics, - "broker": workers.metrics.snapshot(workers.workers.len()), - }))); - } - } - ListenApiRequest::GetStatus { reply } => { - let pending: Vec = pending_deliveries.values().map(|pd| { - json!({ - "delivery_id": pd.delivery.delivery_id, - "worker_name": pd.worker_name, - "event_id": pd.delivery.event_id, - "attempts": pd.attempts, - }) - }).collect(); - let _ = reply.send(Ok(json!({ - "agent_count": 
workers.workers.len(), - "agents": workers.list(), - "pending_delivery_count": pending.len(), - "pending_deliveries": pending, - }))); - } - ListenApiRequest::GetCrashInsights { reply } => { - let _ = reply.send(Ok(crash_insights.to_json())); - } - ListenApiRequest::Preflight { agents, reply } => { - let count = agents.len(); - let _ = reply.send(Ok(json!({ "queued": count }))); - // Background preflight — same as stdio handler - for entry in agents { - let http = relaycast_http.clone(); - tokio::spawn(async move { - let _ = tokio::time::timeout( - Duration::from_secs(30), - http.register_agent_token(&entry.name, Some(&entry.cli)), - ).await; - }); - } - } - ListenApiRequest::SubscribeChannels { name, channels, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - let mut added = Vec::new(); - for ch in &channels { - let exists = handle.spec.channels.iter() - .any(|c| c.eq_ignore_ascii_case(ch)); - if !exists { - handle.spec.channels.push(ch.clone()); - added.push(ch.clone()); - } - } - let all_channels = handle.spec.channels.clone(); - let _ = reply.send(Ok(json!({ - "name": name, - "channels": all_channels, - }))); - } - ListenApiRequest::UnsubscribeChannels { name, channels, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - handle.spec.channels.retain(|c| { - !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c)) - }); - let remaining = handle.spec.channels.clone(); - let _ = reply.send(Ok(json!({ - "name": name, - "channels": remaining, - }))); - } - ListenApiRequest::GetInboundDeliveryMode { name, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let mode = delivery_states - .get(&name) - .map(|s| s.mode) - .unwrap_or_default(); - let _ = reply.send(Ok(mode)); - } - } - 
ListenApiRequest::SetInboundDeliveryMode { name, mode, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let entry = delivery_states.entry(name.clone()).or_default(); - let previous = entry.mode; - entry.mode = mode; - let to_flush: Vec = if previous - == InboundDeliveryMode::ManualFlush - && mode == InboundDeliveryMode::AutoInject - { - entry.drain_pending() - } else { - Vec::new() - }; - let flushed = to_flush.len(); - if !to_flush.is_empty() { - tracing::info!( - target = "agent_relay::broker", - worker = %name, - drained = flushed, - "draining pending queue on manual_flush → auto_inject transition" - ); - } - for queued in to_flush { - inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &name, - &queued, - delivery_retry_interval, - ) - .await; - } - tracing::info!( - target = "agent_relay::broker", - worker = %name, - previous_mode = previous.as_wire_str(), - mode = mode.as_wire_str(), - flushed, - "inbound delivery mode updated" - ); - if previous != mode { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_inbound_delivery_mode_changed", - "name":&name, - "previous_mode":previous.as_wire_str(), - "mode":mode.as_wire_str(), - }), - ).await; - } - if flushed > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_pending_drained", - "name":&name, - "count":flushed, - "reason":"delivery_mode_transition", - }), - ).await; - } - let _ = reply.send(Ok(SetInboundDeliveryModeOk { mode, flushed })); - } - } - ListenApiRequest::GetPending { name, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let snapshot = delivery_states - .get(&name) - .map(|s| s.pending_snapshot()) - .unwrap_or_default(); - let _ = reply.send(Ok(snapshot)); - } - } - ListenApiRequest::FlushPending { name, reply } => { - if !workers.has_worker(&name) { - let _ = 
reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let to_flush: Vec = delivery_states - .get_mut(&name) - .map(|state| state.drain_pending()) - .unwrap_or_default(); - let flushed = to_flush.len(); - if flushed > 0 { - tracing::info!( - target = "agent_relay::broker", - worker = %name, - drained = flushed, - "flushing pending queue on explicit /flush" - ); - } - for queued in to_flush { - inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &name, - &queued, - delivery_retry_interval, - ) - .await; - } - if flushed > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_pending_drained", - "name":&name, - "count":flushed, - "reason":"explicit_flush", - }), - ).await; - } - let _ = reply.send(Ok(flushed)); - } - } - ListenApiRequest::Shutdown { reply } => { - let _ = reply.send(Ok(json!({ "status": "shutting_down" }))); - shutdown = true; - } - ListenApiRequest::RenewLease { reply } => { - last_lease_renewal = Instant::now(); - let expires_in = lease_duration.map(|d| d.as_secs()).unwrap_or(0); - let _ = reply.send(Ok(json!({ - "renewed": true, - "expires_in_secs": expires_in, - "persist": cmd.persist, - }))); - } - } - } - } - - // Stdin is no longer used for SDK communication — all control - // goes through the HTTP/WS API. We drain stdin to avoid - // blocking if anything writes to it, and stop polling after EOF. 
- result = sdk_lines.next_line(), if stdin_open => { - if matches!(result, Ok(None) | Err(_)) { - stdin_open = false; - } - } - - ws_msg = ws_inbound_rx.recv() => { - if let Some(ws_msg) = ws_msg { - let workspace_id = ws_msg.workspace_id.clone(); - let workspace_alias = ws_msg.workspace_alias.clone(); - let ws_value = ws_msg.value; - let workspace_state = workspace_lookup - .get(&workspace_id) - .cloned() - .unwrap_or_else(|| default_workspace.clone()); - let workspace_self_name = workspace_state.self_name.clone(); - let workspace_self_names = workspace_state.self_names.clone(); - let workspace_self_agent_ids = workspace_state.self_agent_ids.clone(); - let workspace_http = workspace_state.http_client.clone(); - let ws_type = ws_value - .get("type") - .and_then(Value::as_str) - .unwrap_or(""); - tracing::info!( - target = "agent_relay::broker", - ws_type = %ws_type, - workspace_id = %workspace_id, - event = %ws_value, - "received relaycast ws event" - ); - - let control_dedup_key = if matches!( - ws_type, - "agent.spawn_requested" | "agent.release_requested" - ) { - relaycast_ws_control_dedup_key(&workspace_id, ws_type, &ws_value) - } else { - None - }; - - if let Some(ref control_dedup_key) = control_dedup_key { - if !dedup.insert_if_new(control_dedup_key, Instant::now()) { - tracing::info!( - ws_type = %ws_type, - workspace_id = %workspace_id, - "dropping duplicate relaycast control event" - ); - continue; - } - } - - if matches!(ws_type, "agent.spawn_requested" | "agent.release_requested") { - if let Err(ref deser_err) = serde_json::from_value::(ws_value.clone()) { - eprintln!( - "[agent-relay] WARNING: failed to deserialize {} event: {}", - ws_type, deser_err - ); - } - } - if let Ok(ws_event) = serde_json::from_value::(ws_value.clone()) { - match ws_event { - WsEvent::AgentReleaseRequested(event) => { - let name = event.agent.name; - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - 
workspace_http.forget_agent_registration(&name); - tracing::debug!( - worker = %name, - "ignoring relaycast release request for broker self" - ); - continue; - } - workers.supervisor.unregister(&name); - workers.metrics.on_release(&name); - match workers.release(&name).await { - Ok(()) => { - workspace_http.forget_agent_registration(&name); - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({"kind":"delivery_dropped","name":name,"count":dropped,"reason":"agent_released"}), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "relaycast_release"); - delivery_states.remove(&name); - telemetry.track(TelemetryEvent::AgentRelease { - cli: String::new(), - release_reason: "relaycast_release".to_string(), - lifetime_seconds: 0, - release_source: ActionSource::Protocol, - }); - state.agents.remove(&name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_released","name":name}), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "exited", - Some("relaycast_release"), - ) - .await; - tracing::info!(child = %name, "released worker via relaycast in broker mode"); - eprintln!("[agent-relay] released worker '{}' via relaycast", name); - } - Err(error) => { - let message = error.to_string(); - if is_unknown_worker_error_message(&message) { - workspace_http.forget_agent_registration(&name); - state.agents.remove(&name); - if paths.persist { - if let Err(save_error) = state.save(&paths.state) { - tracing::warn!( - path = %paths.state.display(), - error = %save_error, - "failed to persist broker state" - ); - } - } - tracing::debug!( - child = %name, - "ignoring duplicate relaycast release for already exited worker" - ); - } else { - 
tracing::error!(child = %name, error = %error, "failed to release worker via relaycast"); - eprintln!("[agent-relay] failed to release '{}': {}", name, error); - } - } - } - continue; - } - WsEvent::AgentSpawnRequested(event) => { - let name = event.agent.name; - eprintln!("[agent-relay] received spawn request for '{}' (cli: {})", name, event.agent.cli); - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - tracing::debug!( - worker = %name, - "ignoring relaycast spawn request for broker self" - ); - eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); - continue; - } - let local_spawn_echo_key = - relaycast_spawn_control_dedup_key(&workspace_id, &name); - if relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key.as_deref(), - &local_spawn_echo_key, - ) && !dedup.insert_if_new(&local_spawn_echo_key, Instant::now()) - { - tracing::info!( - worker = %name, - workspace_id = %workspace_id, - "dropping duplicate/local relaycast spawn request" - ); - eprintln!("[agent-relay] dropping duplicate spawn request for '{}'", name); - continue; - } - let cli = event.agent.cli; - let task = Some(event.agent.task).filter(|value| !value.trim().is_empty()); - let channel = event.agent.channel; - - tracing::info!(name = %name, cli = %cli, task = ?task, channel = ?channel, "handling spawn request from relaycast WS"); - let channels = channel - .as_deref() - .map(|ch| { - let mut chs = default_spawn_channels(); - if !chs.contains(&ch.to_string()) { - chs.push(ch.to_string()); - } - chs - }) - .unwrap_or_else(default_spawn_channels); - let spec = AgentSpec { - name: name.clone(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some(cli.clone()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: vec![], - channels: channels.clone(), - restart_policy: None, - }; - let effective_task = normalize_initial_task(task.clone()); - - // Pre-register agent 
token. Claude doesn't need this — it - // bakes the API key into --mcp-config JSON and self-registers. - // Non-Claude CLIs need the token injected into their CLI args - // at spawn time, so we do a quick (3s) registration attempt. - let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); - let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); - let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); - let worker_relay_key = { - let ws_token = relaycast_ws_spawn_token(&ws_value); - if ws_token.is_some() { - ws_token - } else if is_claude { - // Claude self-registers via its MCP server — skip blocking call - None - } else { - const REG_TIMEOUT: Duration = Duration::from_secs(3); - match tokio::time::timeout( - REG_TIMEOUT, - workspace_http.register_agent_token(&name, Some(cli.as_str())), - ).await { - Ok(Ok(token)) => { - tracing::info!( - worker = %name, - "pre-registered agent via broker for WS spawn" - ); - Some(token) - } - Ok(Err(error)) => { - tracing::warn!( - worker = %name, - error = %error, - "WS spawn pre-registration failed; agent will self-register" - ); - None - } - Err(_) => { - tracing::warn!( - worker = %name, - "WS spawn pre-registration timed out (3s); agent will self-register" - ); - None - } - } - } - }; - - match workers.spawn( - spec, - Some("Relaycast".to_string()), - None, - worker_relay_key.clone(), - false, - Some(workspace_id.clone()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::Protocol, - has_task: effective_task.is_some(), - is_shadow: false, - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent 
{ - runtime: AgentRuntime::Pty, - parent: Some("Relaycast".to_string()), - channels, - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_spawned", - "name": name, - "runtime": "pty", - "cli": cli, - "model": effective_spec.model.clone(), - "pid": pid, - "source": "relaycast_ws", - "pre_registered": worker_relay_key.is_some(), - }), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "spawned", - Some("relaycast_spawn"), - ) - .await; - tracing::info!(child = %name, pid, "spawned worker via relaycast WS"); - eprintln!("[agent-relay] spawned worker '{}' via relaycast", name); - } - Err(e) => { - let msg = e.to_string(); - if msg.contains("already exists") { - tracing::debug!(child = %name, "agent already spawned via SDK, skipping duplicate relaycast WS spawn"); - } else { - tracing::error!(child = %name, error = %e, "failed to spawn worker via relaycast WS"); - eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); - } - } - } - continue; - } - _ => {} - } - } else if ws_type == "agent.spawn_requested" { - // Fallback: the SDK failed to deserialize the event (e.g. missing - // fields like `already_existed` or `task: null`). Extract the - // spawn info directly from the raw JSON so we don't silently - // drop the request. 
- let agent_obj = ws_value.get("agent"); - let name = agent_obj - .and_then(|a| a.get("name")) - .and_then(Value::as_str) - .unwrap_or("") - .to_string(); - let cli = agent_obj - .and_then(|a| a.get("cli")) - .and_then(Value::as_str) - .unwrap_or("claude") - .to_string(); - let task = agent_obj - .and_then(|a| a.get("task")) - .and_then(Value::as_str) - .unwrap_or("") - .to_string(); - let channel = agent_obj - .and_then(|a| a.get("channel")) - .and_then(Value::as_str) - .map(String::from); - - if !name.is_empty() { - eprintln!("[agent-relay] handling spawn request for '{}' via JSON fallback (cli: {})", name, cli); - - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); - } else { - let local_spawn_echo_key = - relaycast_spawn_control_dedup_key(&workspace_id, &name); - let should_dedup = relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key.as_deref(), - &local_spawn_echo_key, - ); - // Always insert the local echo key for consistency with the primary path - let is_new = dedup.insert_if_new(&local_spawn_echo_key, Instant::now()); - if !should_dedup || is_new - { - let channels = channel - .as_deref() - .map(|ch| { - let mut chs = default_spawn_channels(); - if !chs.contains(&ch.to_string()) { - chs.push(ch.to_string()); - } - chs - }) - .unwrap_or_else(default_spawn_channels); - let spec = AgentSpec { - name: name.clone(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some(cli.clone()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: vec![], - channels: channels.clone(), - restart_policy: None, - }; - let task_opt = Some(task).filter(|v| !v.trim().is_empty()); - let effective_task = normalize_initial_task(task_opt.clone()); - - // Pre-register (same logic as primary WS spawn path). 
- let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); - let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); - let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); - let worker_relay_key = { - let ws_token = relaycast_ws_spawn_token(&ws_value); - if ws_token.is_some() { - ws_token - } else if is_claude { - None - } else { - const REG_TIMEOUT: Duration = Duration::from_secs(3); - match tokio::time::timeout( - REG_TIMEOUT, - workspace_http.register_agent_token(&name, Some(cli.as_str())), - ).await { - Ok(Ok(token)) => Some(token), - Ok(Err(error)) => { - tracing::warn!( - worker = %name, - error = %error, - "WS spawn fallback pre-registration failed" - ); - None - } - Err(_) => { - tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out (3s)"); - None - } - } - } - }; - - match workers.spawn( - spec, - Some("Relaycast".to_string()), - None, - worker_relay_key.clone(), - false, - Some(workspace_id.clone()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::Protocol, - has_task: effective_task.is_some(), - is_shadow: false, - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent { - runtime: AgentRuntime::Pty, - parent: Some("Relaycast".to_string()), - channels, - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( 
- &sdk_out_tx, - json!({ - "kind": "agent_spawned", - "name": name, - "runtime": "pty", - "cli": cli, - "model": effective_spec.model.clone(), - "pid": pid, - "source": "relaycast_ws_fallback", - "pre_registered": worker_relay_key.is_some(), - }), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "spawned", - Some("relaycast_spawn"), - ) - .await; - eprintln!("[agent-relay] spawned worker '{}' via relaycast (JSON fallback)", name); - } - Err(e) => { - let msg = e.to_string(); - if !msg.contains("already exists") { - eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); - } - } - } - } else { - eprintln!("[agent-relay] dropping duplicate spawn request for '{}' (fallback)", name); - } - } - } - // Don't fall through to map_ws_event for control events - // handled by the JSON fallback path. - continue; - } - - // Preserve the raw channel from the WS event for thread replies. - // The mapper may set target = "thread" (synthetic) when the SDK - // struct lacks a channel field; we use the raw value to fix - // display_target so the dashboard can route the message correctly. 
- let raw_ws_channel = ws_value - .get("channel") - .and_then(Value::as_str) - .map(String::from); - - if let Some(mapped) = map_ws_event(&ws_value, &workspace_id, workspace_alias.as_deref()) { - tracing::info!( - from = %mapped.from, - target = %mapped.target, - kind = ?mapped.kind, - event_id = %mapped.event_id, - text_len = mapped.text.len(), - "mapped inbound WS event" - ); - let dedup_key = format!("{}:{}", mapped.workspace_id, mapped.event_id); - if !dedup.insert_if_new(&dedup_key, Instant::now()) { - tracing::info!(event_id = %mapped.event_id, workspace_id = %mapped.workspace_id, "dropping duplicate event"); - continue; - } - let has_local_target = if mapped.target.starts_with('#') { - !workers - .worker_names_for_channel_delivery(&mapped.target, &mapped.from, Some(&workspace_id)) - .is_empty() - } else if matches!(mapped.kind, InboundKind::ThreadReply) && mapped.target == "thread" { - // Thread replies target "thread" (synthetic), not a specific worker. - // Treat as having a local target when any worker exists so the - // self-echo filter doesn't drop dashboard-originated thread replies. - workers.has_any_worker() - } else { - workers.has_worker_by_name_ignoring_case(&mapped.target) - }; - if routing::is_self_echo( - &mapped, - &workspace_self_names, - &workspace_self_agent_ids, - has_local_target, - ) { - tracing::info!(from = %mapped.from, sender_agent_id = ?mapped.sender_agent_id, self_names = ?workspace_self_names, "skipping self-echo in broker loop"); - continue; - } - - telemetry.track(TelemetryEvent::MessageSend { - is_broadcast: mapped.target.starts_with('#'), - has_thread: mapped.thread_id.is_some(), - }); - - let mut delivery_plan = { - let worker_view = workers.routing_workers(); - routing::resolve_delivery_targets(&mapped, &worker_view) - }; - - // For thread replies with synthetic target "thread", override - // display_target with the actual channel so the dashboard can - // route the message to the correct channel/DM view. 
- if matches!(mapped.kind, InboundKind::ThreadReply) - && delivery_plan.display_target == "thread" - { - if let Some(ref ch) = raw_ws_channel { - let chan_target = if ch.starts_with('#') { - ch.clone() - } else { - format!("#{ch}") - }; - tracing::info!( - original_target = "thread", - resolved_target = %chan_target, - "overriding thread reply display_target with raw WS channel" - ); - delivery_plan.display_target = chan_target; - } - } - - if mapped.target.starts_with('#') { - tracing::info!( - channel = %mapped.target, - from = %mapped.from, - target_count = delivery_plan.targets.len(), - targets = ?delivery_plan.targets, - "channel delivery targets" - ); - } else { - tracing::info!( - target = %mapped.target, - from = %mapped.from, - kind = ?mapped.kind, - direct_targets = ?delivery_plan.targets, - "direct message routing" - ); - } - - if delivery_plan.needs_dm_resolution { - let conversation_id = mapped.target.clone(); - tracing::info!(conversation_id = %conversation_id, "resolving DM participants"); - let participants = resolve_dm_participants_cached( - &workspace_http, - &mut dm_participants_cache, - &workspace_id, - &conversation_id, - ) - .await; - tracing::info!(participants = ?participants, "resolved DM participants"); - - if let Some(participant) = participants - .iter() - .find(|participant| !agent_name_eq(participant, &mapped.from)) - { - delivery_plan.display_target = participant.clone(); - } - - let worker_view = workers.routing_workers(); - delivery_plan.targets = routing::worker_names_for_dm_participants( - &worker_view, - &participants, - &mapped.from, - Some(&workspace_id), - ); - tracing::info!(dm_targets = ?delivery_plan.targets, "DM participant-based routing targets"); - } - - for worker_name in delivery_plan.targets { - // Inbound-delivery queue: mirrors the /api/send - // queue above. Auto-inject workers drain the queue - // immediately; manual-flush workers leave relaycast - // messages parked until flush. 
The same full-context - // capture makes drains reproduce the original - // delivery (channel/thread/workspace). - match queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - &worker_name, - InboundContext { - from: &mapped.from, - body: &mapped.text, - target: &mapped.target, - thread_id: mapped.thread_id.as_deref(), - workspace_id: Some(mapped.workspace_id.as_str()), - workspace_alias: mapped.workspace_alias.as_deref(), - priority: mapped.priority.as_u8(), - mode: MessageInjectionMode::Wait, - event_id: Some(&mapped.event_id), - }, - ) { - InboundQueueOutcome::Queued => { - tracing::info!( - target = "agent_relay::broker", - event_id = %mapped.event_id, - worker = %worker_name, - "queued inbound relay message (manual_flush inbound delivery mode)" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_queued", - "name":&worker_name, - "event_id":&mapped.event_id, - "from":&mapped.from, - "target":&mapped.target, - "reason":"inbound_delivery_manual_flush", - }), - ).await; - continue; - } - InboundQueueOutcome::DrainNow(to_drain) => { - for queued in to_drain { - if let Err(error) = try_inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &worker_name, - &queued, - delivery_retry_interval, - ) - .await - { - let _ = send_error( - &sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"worker": worker_name})), - ) - .await; - } - } - continue; - } - InboundQueueOutcome::WorkerMissing => {} - } - if let Err(error) = queue_and_try_delivery( - &mut workers, - &mut pending_deliveries, - &worker_name, - &mapped, - delivery_retry_interval, - ).await { - let _ = send_error(&sdk_out_tx, None, "delivery_failed", error.to_string(), true, Some(json!({"worker": worker_name}))).await; - } - } - - let display_target = - display_target_for_dashboard(&delivery_plan.display_target, &workspace_self_names, &workspace_self_name); - let display_from = if is_self_name(&workspace_self_names, &mapped.from) 
- { - workspace_self_name.clone() - } else { - mapped.from.clone() - }; - tracing::info!( - from = %display_from, - display_target = %display_target, - event_id = %mapped.event_id, - body_len = mapped.text.len(), - "broadcasting relay_inbound to dashboard" - ); - record_thread_history_event( - &mut recent_thread_messages, - json!({ - "event_id": mapped.event_id.clone(), - "from": display_from.clone(), - "target": display_target.clone(), - "text": mapped.text.clone(), - "thread_id": mapped.thread_id.clone(), - "workspace_id": mapped.workspace_id.clone(), - "workspace_alias": mapped.workspace_alias.clone(), - "timestamp": chrono::Utc::now().to_rfc3339(), - }), - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": mapped.event_id, - "from": display_from, - "target": display_target, - "body": mapped.text, - "thread_id": mapped.thread_id, - "workspace_id": mapped.workspace_id, - "workspace_alias": mapped.workspace_alias, - }), - ).await; - } else if ws_type != "broker.connection" && ws_type != "broker.channel_join" { - tracing::info!( - target = "agent_relay::broker", - ws_type = %ws_type, - event = %ws_value, - "relaycast ws event ignored by inbound mapper" - ); - } - } - } - - worker_event = worker_event_rx.recv() => { - if let Some(worker_event) = worker_event { - match worker_event { - WorkerEvent::Message { name, value } => { - if let Some(msg_type) = value.get("type").and_then(Value::as_str) { - if msg_type == "delivery_ack" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload - .get("delivery_id") - .and_then(Value::as_str) - .unwrap_or(""); - - // Terminal guard: ignore late delivery_ack events once a - // delivery has reached terminal failed status. 
- if !delivery_id.is_empty() - && terminal_failed_deliveries.contains(delivery_id) - { - tracing::info!( - worker = %name, - delivery_id = %delivery_id, - "ignoring late delivery_ack after terminal failed status" - ); - continue; - } - - if let Ok(ack) = serde_json::from_value::(payload.clone()) { - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - &ack.delivery_id, - Some(&ack.event_id), - &name, - "delivery_ack", - ); - terminal_failed_deliveries.remove(&ack.delivery_id); - } - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_ack", - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_queued" { - if let Some(payload) = value.get("payload") { - let _ = send_event(&sdk_out_tx, json!({ - "kind": msg_type, - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_injected" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload - .get("delivery_id") - .and_then(Value::as_str) - .unwrap_or(""); - let event_id = - payload.get("event_id").and_then(Value::as_str); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - event_id, - &name, - "delivery_injected", - ); - let _ = send_event(&sdk_out_tx, json!({ - "kind": msg_type, - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_verified" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); - let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); - tracing::debug!( - target = "agent_relay::broker", - worker = %name, - delivery_id = 
%delivery_id, - event_id = %event_id, - "delivery verified by echo detection" - ); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - Some(event_id), - &name, - "delivery_verified", - ); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_verified", - "name": name, - "delivery_id": delivery_id, - "event_id": event_id, - })).await; - } - } else if msg_type == "delivery_active" { - if let Some(payload) = value.get("payload") { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_active", - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "pattern": payload.get("pattern"), - })).await; - } - } else if msg_type == "delivery_failed" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); - let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); - let reason = payload.get("reason").and_then(Value::as_str).unwrap_or("unknown"); - tracing::warn!( - target = "agent_relay::broker", - worker = %name, - delivery_id = %delivery_id, - event_id = %event_id, - reason = %reason, - "delivery failed — echo not detected" - ); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - Some(event_id), - &name, - "delivery_failed", - ); - if !delivery_id.is_empty() { - terminal_failed_deliveries - .insert(delivery_id.to_string()); - } - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_failed", - "name": name, - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - })).await; - } - } else if msg_type == "worker_error" { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_error", - "name": name, - "error": value.get("payload").cloned().unwrap_or(Value::Null) - })).await; - } else if msg_type.ends_with("_response") { - // Generic worker request/response dispatch. 
- // Any frame whose `type` ends in - // `_response` is routed by `request_id` - // into the matching parked `oneshot` in - // `pending_requests`. The pending entry - // owns the format/error decoding logic - // via `worker_request::fulfil_response_frame`. - let routed = worker_request::fulfil_response_frame( - &mut pending_requests, - &value, - ); - if !routed { - let req_id = value - .get("request_id") - .and_then(Value::as_str) - .unwrap_or(""); - tracing::debug!( - target = "agent_relay::broker", - worker = %name, - msg_type = %msg_type, - request_id = %req_id, - "worker response with no pending caller — dropping" - ); - } - } else if msg_type == "worker_stream" { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_stream", - "name": name, - "stream": value.get("payload").and_then(|p| p.get("stream")).cloned().unwrap_or(Value::String("stdout".to_string())), - "chunk": value.get("payload").and_then(|p| p.get("chunk")).cloned().unwrap_or(Value::String(String::new())), - })).await; - } else if msg_type == "worker_ready" { - if let Some(task_text) = workers.initial_tasks.remove(&name) { - let event_id = format!("init_{}", Uuid::new_v4().simple()); - if let Err(e) = queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &name, - &event_id, - "broker", - &name, - &task_text, - None, - None, - None, - 2, - MessageInjectionMode::Wait, - delivery_retry_interval, - ).await { - tracing::warn!(worker = %name, error = %e, "failed to deliver initial_task"); - } - } - let runtime = value.get("payload") - .and_then(|p| p.get("runtime")) - .and_then(Value::as_str) - .unwrap_or("pty"); - let (provider_val, cli_val, model_val) = workers.workers.get(&name) - .map(|h| (h.spec.provider.clone(), h.spec.cli.clone(), h.spec.model.clone())) - .unwrap_or((None, None, None)); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_ready", - "name": name, - "runtime": runtime, - "provider": provider_val, - "cli": cli_val, - "model": model_val, - })).await; - } 
else if msg_type == "agent_idle" { - let idle_secs = value.get("payload") - .and_then(|p| p.get("idle_secs")) - .and_then(Value::as_u64) - .unwrap_or(0); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "agent_idle", - "name": name, - "idle_secs": idle_secs, - })).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "idle", - Some("idle_threshold"), - ) - .await; - } else if msg_type == "agent_exit" { - let reason = value.get("payload") - .and_then(|p| p.get("reason")) - .and_then(Value::as_str) - .unwrap_or("unknown"); - tracing::info!(agent = %name, reason = %reason, "agent requested exit"); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "agent_exit", - "name": name, - "reason": reason, - })).await; - } else if msg_type == "continuity_command" { - // Agent-initiated continuity: the pty_worker detected a - // KIND: continuity block in PTY output and emitted this event. - let action = value.get("payload") - .and_then(|p| p.get("action")) - .and_then(Value::as_str) - .unwrap_or(""); - let content = value.get("payload") - .and_then(|p| p.get("content")) - .and_then(Value::as_str) - .unwrap_or(""); - match action { - "save" => { - let cont_dir = continuity_dir(&paths.state); - if let Err(e) = std::fs::create_dir_all(&cont_dir) { - tracing::warn!( - agent = %name, - error = %e, - "continuity_command save: failed to create dir" - ); - } else { - // Build a minimal continuity record with the provided summary. 
- let agent_data = state.agents.get(&name); - let cli = agent_data - .and_then(|d| d.spec.as_ref()) - .and_then(|s| s.cli.clone()); - let initial_task = agent_data - .and_then(|d| d.initial_task.clone()); - let continuity = json!({ - "agent_name": name, - "cli": cli, - "initial_task": initial_task, - "released_at": null, - "lifetime_seconds": null, - "message_history": [], - "summary": content, - }); - let cont_file = cont_dir.join(format!("{}.json", name)); - match std::fs::write( - &cont_file, - serde_json::to_string_pretty(&continuity) - .unwrap_or_default(), - ) { - Ok(()) => tracing::info!( - agent = %name, - path = %cont_file.display(), - "continuity_command: saved agent-initiated continuity" - ), - Err(e) => tracing::warn!( - agent = %name, - error = %e, - "continuity_command save: failed to write file" - ), - } - } - } - "load" => { - let cont_dir = continuity_dir(&paths.state); - let cont_file = cont_dir.join(format!("{}.json", name)); - if cont_file.exists() { - match std::fs::read_to_string(&cont_file) { - Ok(raw) => { - if let Ok(ctx) = serde_json::from_str::(&raw) { - // Build a context summary and inject it - let prev_task = ctx.get("initial_task") - .and_then(Value::as_str) - .unwrap_or("unknown"); - let summary = ctx.get("summary") - .and_then(Value::as_str) - .unwrap_or("no summary"); - let history_str = ctx.get("message_history") - .and_then(Value::as_array) - .map(|msgs| { - msgs.iter() - .filter_map(|m| { - let from = m.get("from")?.as_str()?; - let text = m.get("text") - .or_else(|| m.get("body"))? 
- .as_str()?; - Some(format!(" - {}: {}", from, text)) - }) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - let history_section = if history_str.is_empty() { - String::new() - } else { - format!("\nRecent messages:\n{}", history_str) - }; - let inject_body = format!( - "## Continuity Context (from previous session as '{}')\n\ - Previous task: {}\n\ - Session summary: {}{}", - name, prev_task, summary, history_section - ); - let event_id = format!("cont_load_{}", Uuid::new_v4().simple()); - if let Err(e) = queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &name, - &event_id, - "broker", - &name, - &inject_body, - None, - None, - None, - 2, - MessageInjectionMode::Wait, - delivery_retry_interval, - ).await { - tracing::warn!( - agent = %name, - error = %e, - "continuity_command load: failed to inject context" - ); - } else { - tracing::info!( - agent = %name, - "continuity_command: injected loaded context" - ); - } - } - } - Err(e) => tracing::warn!( - agent = %name, - error = %e, - "continuity_command load: failed to read file" - ), - } - } else { - tracing::debug!( - agent = %name, - "continuity_command load: no continuity file found" - ); - } - } - "uncertain" => { - tracing::info!( - agent = %name, - content = %content, - "continuity_command: agent reported uncertainty" - ); - } - other => { - tracing::warn!( - agent = %name, - action = %other, - "continuity_command: unknown action ignored" - ); - } - } - } else if msg_type == "worker_exited" { - // PTY worker process is exiting — clean up and - // emit agent_exited so the SDK doesn't have to - // wait for the reap_exited polling cycle. 
- let code = value.get("payload") - .and_then(|p| p.get("code")) - .and_then(Value::as_i64) - .map(|c| c as i32); - let signal = value.get("payload") - .and_then(|p| p.get("signal")) - .and_then(Value::as_str) - .map(String::from); - tracing::info!( - agent = %name, - code = ?code, - signal = ?signal, - "worker_exited received — cleaning up" - ); - // Remove from registry so reap_exited won't - // double-process this worker. - workers.workers.remove(&name); - workers.initial_tasks.remove(&name); - // Drop pending deliveries for this worker - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "delivery_dropped", - "name": name, - "count": dropped, - "reason": "worker_exited", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "worker_exited"); - delivery_states.remove(&name); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_exited", - "name": name, - "code": code, - "signal": signal, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "exited", - Some("worker_exited"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(&name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark exited worker offline in relaycast" - ); - } - state.agents.remove(&name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!( - path = %paths.state.display(), - error = %error, - "failed to persist broker state" - ); - } - } - } - } - } - } - } - } - - _ = reap_tick.tick() => { - let now = Instant::now(); - - // Time out worker request/response calls whose worker never - // responded. Common cause: worker crashed between us sending - // the request frame and it parsing the frame. Without this - // sweep the HTTP handler would hang forever on its oneshot. 
- for (req_id, worker_name, kind) in - worker_request::reap_expired(&mut pending_requests, now) - { - tracing::warn!( - target = "agent_relay::broker", - request_id = %req_id, - worker = %worker_name, - kind = %kind, - "worker request timed out before worker responded" - ); - } - - let due_ids: Vec = pending_deliveries - .iter() - .filter_map(|(delivery_id, pending)| { - if pending.next_retry_at <= now { - Some(delivery_id.clone()) - } else { - None - } - }) - .collect(); - - for delivery_id in due_ids { - let was_retry = pending_deliveries - .get(&delivery_id) - .map(|pending| pending.attempts > 0) - .unwrap_or(false); - - match retry_pending_delivery( - &delivery_id, - &mut workers, - &mut pending_deliveries, - delivery_retry_interval, - ) - .await { - Ok(Some((worker_name, attempts, event_id))) => { - if was_retry { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_retry", - "name": worker_name, - "delivery_id": delivery_id, - "event_id": event_id, - "attempts": attempts, - }), - ).await; - } - } - Ok(None) => { - if was_retry { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "delivery_dropped", - "delivery_id": delivery_id, - "reason": "max_retries_exceeded", - }), - ).await; - } - } - Err(error) => { - let _ = send_error( - &sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"delivery_id": delivery_id})), - ).await; - } - } - } - - let exited = match workers.reap_exited().await { - Ok(v) => v, - Err(e) => { - tracing::warn!(err = %e, "reap_exited failed, skipping this cycle"); - vec![] - } - }; - for (name, code, signal) in &exited { - // Record crash in insights - let (category, description) = relay_broker::crash_insights::CrashInsights::analyze(*code, signal.as_deref()); - crash_insights.record(relay_broker::crash_insights::CrashRecord { - agent_name: name.clone(), - exit_code: *code, - signal: signal.clone(), - timestamp: std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - 
.unwrap_or_default() - .as_secs(), - uptime_secs: 0, - category, - description, - }); - - telemetry.track(TelemetryEvent::AgentCrash { - cli: String::new(), - exit_code: *code, - lifetime_seconds: 0, - }); - - // Check supervisor for restart decision - use relay_broker::supervisor::RestartDecision; - match workers.supervisor.on_exit(name, *code, signal.as_deref()) { - Some(RestartDecision::Restart { delay }) => { - // Keep pending deliveries — we'll redeliver after restart - workers.metrics.on_crash(name); - let restart_count = workers.supervisor.restart_count(name) + 1; - tracing::info!( - name = %name, - exit_code = ?code, - signal = ?signal, - restart_count, - delay_ms = delay.as_millis() as u64, - "agent will be restarted" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_restarting", - "name": name, - "code": code, - "signal": signal, - "restart_count": restart_count, - "delay_ms": delay.as_millis() as u64, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "stuck", - Some("restarting"), - ) - .await; - } - Some(RestartDecision::PermanentlyDead { reason }) => { - workers.metrics.on_permanent_death(name); - let dropped = drop_pending_for_worker(&mut pending_deliveries, name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_dropped", - "name": name, - "count": dropped, - "reason":"worker_permanently_dead", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, name, "worker_permanently_dead"); - delivery_states.remove(name); - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_permanently_dead","name":name,"reason":reason}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "stuck", - Some("permanently_dead"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark permanently dead worker offline in relaycast" - ); - } - 
state.agents.remove(name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - } - None => { - // Not supervised — original behavior - let dropped = drop_pending_for_worker(&mut pending_deliveries, name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_dropped", - "name": name, - "count": dropped, - "reason":"worker_exited", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, name, "worker_exited"); - delivery_states.remove(name); - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_exited","name":name,"code":code,"signal":signal}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "exited", - Some("worker_exited"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark exited worker offline in relaycast" - ); - } - state.agents.remove(name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - } - } - } - - // Check for agents ready to restart (past cooldown) - if !shutdown { - let pending_restarts = workers.supervisor.pending_restarts(); - for (name, rst) in pending_restarts { - if let Some(remaining) = relaycast_http.registration_block_remaining(&name) - { - tracing::debug!( - worker = %name, - retry_after_secs = remaining.as_secs().max(1), - "skipping restart while relaycast registration is rate-limited" - ); - continue; - } - - let worker_relay_key = if rst.skip_relay_prompt { - None - } else { - match relaycast_http - .register_agent_token(&name, rst.spec.cli.as_deref()) - .await - { - Ok(token) => Some(token), - Err(error) => { - match registration_retry_after_secs(&error) { - Some(retry_after_secs) => { - 
tracing::warn!( - worker = %name, - retry_after_secs, - error = %error, - "restart blocked by relaycast registration rate limit" - ); - } - None => { - tracing::error!( - worker = %name, - error = %error, - "failed to pre-register worker before restart" - ); - } - } - continue; - } - } - }; - - match workers - .spawn( - rst.spec.clone(), - rst.parent.clone(), - None, - worker_relay_key, - rst.skip_relay_prompt, - None, - ) - .await - { - Ok(_) => { - workers.supervisor.on_restarted(&name); - workers.metrics.on_restart(&name); - if let Some(task) = rst.initial_task { - workers.initial_tasks.insert(name.clone(), task); - } - tracing::info!(name = %name, restart_count = rst.restart_count, "agent restarted"); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_restarted", - "name": name, - "restart_count": rst.restart_count, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "spawned", - Some("restarted"), - ) - .await; - } - Err(e) => { - tracing::error!(name = %name, error = %e, "restart failed"); - } - } - } - } - - // Persist pending deliveries for crash recovery - if paths.persist { - if let Err(error) = save_pending_deliveries(&paths.pending, &pending_deliveries) { - tracing::warn!(path = %paths.pending.display(), error = %error, "failed to persist pending deliveries"); - } - } - } - } - } - - // Save crash insights before shutdown (only in persist mode) - if paths.persist { - if let Err(error) = crash_insights.save(&crash_insights_path) { - tracing::warn!(error = %error, "failed to save crash insights"); - } - } - - telemetry.track(TelemetryEvent::BrokerStop { - uptime_seconds: broker_start.elapsed().as_secs(), - agent_spawn_count, - }); - telemetry.shutdown(); - - let active_workers: Vec = workers.workers.keys().cloned().collect(); - for worker_name in active_workers { - if let Err(error) = relaycast_http.mark_agent_offline(&worker_name).await { - tracing::warn!( - worker = %worker_name, - error = %error, - "failed to 
mark worker offline during shutdown" - ); - } - } - - // Mark broker agent offline in Relaycast before shutting down WS - if let Err(error) = relaycast_http.mark_offline().await { - tracing::warn!(error = %error, "failed to mark broker offline during shutdown"); - } - - if let Err(error) = ws_control_tx.send(WsControl::Shutdown).await { - tracing::warn!(error = %error, "failed to send ws shutdown signal"); - } - pending_deliveries.clear(); - // Clean shutdown — remove pending file since nothing is pending - if paths.persist { - let _ = std::fs::remove_file(&paths.pending); - } - workers.shutdown_all().await?; - - // Clean up state and connection files on graceful shutdown - if paths.persist { - let _ = std::fs::remove_file(&paths.state); - } - let connection_path = paths.state.parent().unwrap().join("connection.json"); - let _ = std::fs::remove_file(&connection_path); - - Ok(()) -} - -/// Get terminal rows from TIOCGWINSZ. -#[cfg(unix)] -fn terminal_rows() -> Option { - use nix::libc; - use nix::pty::Winsize; - let mut ws = Winsize { - ws_row: 0, - ws_col: 0, - ws_xpixel: 0, - ws_ypixel: 0, - }; - unsafe { - if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_row > 0 { - Some(ws.ws_row) - } else { - None - } - } -} - -/// Get terminal cols from TIOCGWINSZ. 
-#[cfg(unix)] -fn terminal_cols() -> Option { - use nix::libc; - use nix::pty::Winsize; - let mut ws = Winsize { - ws_row: 0, - ws_col: 0, - ws_xpixel: 0, - ws_ypixel: 0, - }; - unsafe { - if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_col > 0 { - Some(ws.ws_col) - } else { - None - } - } -} - -#[cfg(not(unix))] -fn terminal_rows() -> Option { - None -} -#[cfg(not(unix))] -fn terminal_cols() -> Option { - None -} - -#[cfg(target_os = "linux")] -fn memory_bytes_for_pid(pid: u32) -> u64 { - let statm_path = format!("/proc/{pid}/statm"); - let statm = match std::fs::read_to_string(statm_path) { - Ok(contents) => contents, - Err(_) => return 0, - }; - - let rss_pages = match statm - .split_whitespace() - .nth(1) - .and_then(|value| value.parse::().ok()) - { - Some(value) => value, - None => return 0, - }; - - let page_size = unsafe { nix::libc::sysconf(nix::libc::_SC_PAGESIZE) }; - if page_size <= 0 { - return 0; - } - - rss_pages.saturating_mul(page_size as u64) -} - -#[cfg(not(target_os = "linux"))] -fn memory_bytes_for_pid(_pid: u32) -> u64 { - 0 -} - -fn build_agent_metrics(handle: &WorkerHandle) -> AgentMetrics { - let pid = handle.child.id().unwrap_or_default(); - AgentMetrics { - name: handle.spec.name.clone(), - pid, - memory_bytes: if pid == 0 { - 0 - } else { - memory_bytes_for_pid(pid) - }, - uptime_secs: handle.spawned_at.elapsed().as_secs(), - } -} - -/// Outcome of [`queue_inbound_for_delivery_mode`]. Distinguishes the -/// three cases broker call sites care about: the message is queued and -/// should wait for an explicit flush, the queue should be drained now, -/// or there's no worker (caller falls through to existing target handling). -#[derive(Debug, Clone, PartialEq, Eq)] -enum InboundQueueOutcome { - Queued, - DrainNow(Vec), - WorkerMissing, -} - -/// Bundle of routing context captured into the pending queue. 
Mirrors the -/// args `queue_and_try_delivery_raw` -/// expects so a drain reproduces the original delivery exactly — same -/// target (channel / DM / thread sentinel), thread, workspace, -/// priority, and injection mode. -struct InboundContext<'a> { - from: &'a str, - body: &'a str, - target: &'a str, - thread_id: Option<&'a str>, - workspace_id: Option<&'a str>, - workspace_alias: Option<&'a str>, - priority: u8, - mode: MessageInjectionMode, - event_id: Option<&'a str>, -} - -/// Queue an inbound relay message through the per-worker [`InboundDeliveryMode`]. -/// -/// Every inbound message is appended to the per-worker pending queue. In -/// [`InboundDeliveryMode::AutoInject`] the caller immediately drains the queue -/// in the same broker turn; in [`InboundDeliveryMode::ManualFlush`] the message -/// stays parked until an explicit flush or mode transition. -/// -/// Pulled out so the broker has one obvious choke point for the two -/// inbound paths (`/api/send` and the relaycast inbound feed) that the -/// `drive` client needs to intercept. Internal broker-driven injections -/// (`worker_ready` initial task, continuity restore) bypass this queue by -/// not calling this helper. 
-fn queue_inbound_for_delivery_mode( - delivery_states: &mut HashMap, - workers: &WorkerRegistry, - worker_name: &str, - ctx: InboundContext<'_>, -) -> InboundQueueOutcome { - if !workers.has_worker(worker_name) { - return InboundQueueOutcome::WorkerMissing; - } - let state = delivery_states.entry(worker_name.to_string()).or_default(); - let should_drain = state.should_drain_immediately(); - let queued_at_ms = chrono::Utc::now().timestamp_millis().max(0) as u64; - let msg = PendingRelayMessage { - from: ctx.from.to_string(), - body: ctx.body.to_string(), - target: ctx.target.to_string(), - thread_id: ctx.thread_id.map(str::to_string), - workspace_id: ctx.workspace_id.map(str::to_string), - workspace_alias: ctx.workspace_alias.map(str::to_string), - priority: ctx.priority, - mode: ctx.mode, - queued_at_ms, - event_id: ctx.event_id.map(str::to_string), - }; - match state.accept_inbound(msg) { - InboundDeliveryDispatch::Queued { queue_len } => { - tracing::debug!( - target = "agent_relay::broker", - worker = %worker_name, - from = %ctx.from, - mode = state.mode.as_wire_str(), - queue_len, - "queued inbound relay message" - ); - } - InboundDeliveryDispatch::QueuedEvicted { - queue_len, - dropped_from, - } => { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - from = %ctx.from, - dropped_from = %dropped_from, - mode = state.mode.as_wire_str(), - queue_len, - max_pending = relay_broker::types::MAX_PENDING_PER_WORKER, - "pending queue full — evicting oldest message" - ); - } - } - if should_drain { - let to_drain = state.drain_pending(); - tracing::debug!( - target = "agent_relay::broker", - worker = %worker_name, - drained = to_drain.len(), - "draining inbound queue immediately (auto_inject delivery mode)" - ); - InboundQueueOutcome::DrainNow(to_drain) - } else { - InboundQueueOutcome::Queued - } -} - -async fn try_inject_pending_relay_message( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - msg: 
&PendingRelayMessage, - retry_interval: Duration, -) -> Result<()> { - let event_id = msg - .event_id - .clone() - .unwrap_or_else(|| format!("flush_{}", Uuid::new_v4().simple())); - match timeout( - retry_interval, - queue_and_try_delivery_raw( - workers, - pending_deliveries, - worker_name, - &event_id, - &msg.from, - // Use the ORIGINAL routing target captured at queue time — - // `#general`, the DM recipient name, `"thread"`, etc. Falling - // back to `worker_name` here would silently reframe channel - // messages as direct-to-worker messages on drain. - &msg.target, - &msg.body, - msg.thread_id.clone(), - msg.workspace_id.clone(), - msg.workspace_alias.clone(), - msg.priority, - msg.mode.clone(), - retry_interval, - ), - ) - .await - { - Ok(result) => result, - Err(_) => Err(anyhow::anyhow!( - "pending relay delivery timed out after {}ms", - retry_interval.as_millis() - )), - } -} - -/// Inject a previously-queued pending relay message into the worker via -/// the existing `queue_and_try_delivery_raw` path. Used by the -/// `/api/spawned/{name}/flush` handler and by the auto-drain on a -/// `manual_flush → auto_inject` transition. Failures are logged but not -/// propagated — the broker treats `flush` as best-effort fire-and-forget -/// the same way `/api/send` does for individual targets. 
-async fn inject_pending_relay_message( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - msg: &PendingRelayMessage, - retry_interval: Duration, -) { - let event_id = msg.event_id.as_deref().unwrap_or(""); - if let Err(error) = try_inject_pending_relay_message( - workers, - pending_deliveries, - worker_name, - msg, - retry_interval, - ) - .await - { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - from = %msg.from, - event_id = %event_id, - error = %error, - "failed to inject pending relay message during flush" - ); - } -} - -async fn queue_and_try_delivery( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - mapped: &relay_broker::types::InboundRelayEvent, - retry_interval: Duration, -) -> Result<()> { - queue_and_try_delivery_raw( - workers, - pending_deliveries, - worker_name, - &mapped.event_id, - &mapped.from, - &mapped.target, - &mapped.text, - mapped.thread_id.clone(), - Some(mapped.workspace_id.clone()), - mapped.workspace_alias.clone(), - mapped.priority.as_u8(), - MessageInjectionMode::Wait, - retry_interval, - ) - .await -} - -#[allow(clippy::too_many_arguments)] -async fn queue_and_try_delivery_raw( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - event_id: &str, - from: &str, - target: &str, - body: &str, - thread_id: Option, - workspace_id: Option, - workspace_alias: Option, - priority: u8, - injection_mode: MessageInjectionMode, - retry_interval: Duration, -) -> Result<()> { - let delivery = RelayDelivery { - delivery_id: format!("del_{}", Uuid::new_v4().simple()), - event_id: event_id.to_string(), - workspace_id, - workspace_alias, - from: from.to_string(), - target: target.to_string(), - body: body.to_string(), - thread_id, - priority: Some(priority), - injection_mode, - }; - let delivery_id = delivery.delivery_id.clone(); - pending_deliveries.insert( - delivery_id.clone(), - PendingDelivery { - 
worker_name: worker_name.to_string(), - delivery, - attempts: 0, - next_retry_at: Instant::now(), - }, - ); - - let _ = - retry_pending_delivery(&delivery_id, workers, pending_deliveries, retry_interval).await?; - Ok(()) -} - -async fn retry_pending_delivery( - delivery_id: &str, - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - retry_interval: Duration, -) -> Result> { - let pending = match pending_deliveries.get(delivery_id) { - Some(pending) => pending.clone(), - None => return Ok(None), - }; - - if pending.attempts >= MAX_DELIVERY_RETRIES { - pending_deliveries.remove(delivery_id); - return Ok(None); - } - - if !workers.has_worker(&pending.worker_name) { - pending_deliveries.remove(delivery_id); - return Ok(None); - } - - match workers - .deliver(&pending.worker_name, pending.delivery.clone()) - .await - { - Ok(()) => { - if let Some(current) = pending_deliveries.get_mut(delivery_id) { - current.attempts = current.attempts.saturating_add(1); - current.next_retry_at = Instant::now() + retry_interval; - return Ok(Some(( - current.worker_name.clone(), - current.attempts, - current.delivery.event_id.clone(), - ))); - } - Ok(None) - } - Err(error) => { - if let Some(current) = pending_deliveries.get_mut(delivery_id) { - current.next_retry_at = Instant::now() + retry_interval; - } - Err(error) - } - } -} - -fn drop_pending_for_worker( - pending_deliveries: &mut HashMap, - worker_name: &str, -) -> usize { - let before = pending_deliveries.len(); - pending_deliveries.retain(|_, pending| pending.worker_name != worker_name); - before.saturating_sub(pending_deliveries.len()) -} - -/// Drain every in-flight worker request targeting `worker_name` and -/// notify each awaiter with [`worker_request::RequestWorkerError::WorkerDisappeared`]. 
-/// Called from every worker-teardown path (explicit release, -/// `worker_exited` frame, `reap_exited` periodic sweep) so HTTP callers -/// don't have to wait out the request deadline when the worker has -/// clearly gone. Logs one structured warning per drained request. -fn fail_pending_requests_for_worker( - pending_requests: &mut HashMap, - worker_name: &str, - reason: &'static str, -) -> usize { - let failed = worker_request::fail_for_worker(pending_requests, worker_name); - for (req_id, kind) in &failed { - tracing::warn!( - target = "agent_relay::broker", - request_id = %req_id, - worker = %worker_name, - kind = %kind, - reason = reason, - "failed pending worker request because worker is gone" - ); - } - failed.len() -} - -fn should_clear_pending_delivery_for_event( - pending: Option<&PendingDelivery>, - event_id: Option<&str>, -) -> bool { - let Some(pending) = pending else { - return true; - }; - - let Some(event_id) = event_id - .map(str::trim) - .filter(|event_id| !event_id.is_empty()) - else { - return true; - }; - - pending.delivery.event_id == event_id -} - -fn clear_pending_delivery_if_event_matches( - pending_deliveries: &mut HashMap, - delivery_id: &str, - event_id: Option<&str>, - worker_name: &str, - worker_signal: &str, -) { - let pending = pending_deliveries.get(delivery_id); - if should_clear_pending_delivery_for_event(pending, event_id) { - pending_deliveries.remove(delivery_id); - return; - } - - if let Some(pending) = pending { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - signal = %worker_signal, - delivery_id = %delivery_id, - expected_event_id = %pending.delivery.event_id, - received_event_id = %event_id.unwrap_or(""), - "ignoring stale delivery lifecycle event due to event_id mismatch" - ); - } -} - -async fn run_headless_worker(cmd: HeadlessCommand) -> Result<()> { - let provider: ProtocolHeadlessProvider = cmd.provider.into(); - let provider_name = headless_provider_cli_name(&provider); - let 
provider_args = cmd.args.clone(); - - let (out_tx, mut out_rx) = mpsc::channel::>(512); - let writer_task = tokio::spawn(async move { - // Keep one async stdout handle for this process. Tokio's `write_all` - // is not cancel-safe if the task is aborted mid-write, so shutdown - // below drops `out_tx` and awaits this task before returning. - let mut stdout = tokio::io::stdout(); - while let Some(frame) = out_rx.recv().await { - if let Ok(mut line) = serde_json::to_string(&frame) { - line.push('\n'); - if stdout.write_all(line.as_bytes()).await.is_err() || stdout.flush().await.is_err() - { - break; - } - } - } - }); - - let mut lines = BufReader::new(tokio::io::stdin()).lines(); - let mut worker_name = cmd - .agent_name - .clone() - .unwrap_or_else(|| format!("headless-{provider_name}")); - let mut final_exit_code: Option = None; - let mut final_exit_signal: Option = None; - - while let Ok(Some(line)) = lines.next_line().await { - let frame: ProtocolEnvelope = match serde_json::from_str(&line) { - Ok(frame) => frame, - Err(error) => { - let _ = send_frame( - &out_tx, - "worker_error", - None, - json!({ - "code":"invalid_frame", - "message": error.to_string(), - "retryable": false, - }), - ) - .await; - continue; - } - }; - - match frame.msg_type.as_str() { - "init_worker" => { - worker_name = cmd - .agent_name - .clone() - .or_else(|| { - frame - .payload - .get("agent") - .and_then(|a| a.get("name")) - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .unwrap_or_else(|| format!("headless-{provider_name}")); - - let _ = send_frame( - &out_tx, - "worker_ready", - frame.request_id, - json!({ - "name": &worker_name, - "runtime": "headless", - }), - ) - .await; - } - "deliver_relay" => { - let request_id = frame.request_id.clone(); - let delivery: RelayDelivery = match serde_json::from_value(frame.payload) { - Ok(d) => d, - Err(error) => { - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - "code":"invalid_delivery", - "message": 
error.to_string(), - "retryable": false, - }), - ) - .await; - continue; - } - }; - - let timestamp = chrono::Utc::now().timestamp_millis(); - let delivery_id = delivery.delivery_id; - let event_id = delivery.event_id; - - let _ = send_frame( - &out_tx, - "delivery_queued", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "agent": &worker_name, - "timestamp": timestamp, - }), - ) - .await; - - let _ = send_frame( - &out_tx, - "delivery_injected", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "agent": &worker_name, - "timestamp": timestamp, - }), - ) - .await; - - let _ = send_frame( - &out_tx, - "delivery_active", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "pattern": format!("headless:{}", provider_name), - }), - ) - .await; - - let task_text = delivery.body.clone(); - let (binary, args) = - headless_provider_command(&provider, &task_text, &provider_args); - - let mut child_cmd = tokio::process::Command::new(&binary); - child_cmd - .args(&args) - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - - // Auto-approve tool permissions for opencode in headless mode. 
- if matches!(provider, ProtocolHeadlessProvider::Opencode) { - child_cmd.env( - "OPENCODE_PERMISSION", - r#"{"*":"allow","external_directory":{"*":"allow"}}"#, - ); - } - - let mut child = match child_cmd.spawn() { - Ok(child) => child, - Err(error) => { - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": format!("failed to spawn {}: {}", binary, error), - }), - ) - .await; - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - "code":"spawn_failed", - "message": format!("failed to spawn {}: {}", binary, error), - "retryable": false, - }), - ) - .await; - final_exit_code = Some(1); - break; - } - }; - - let _ = send_frame( - &out_tx, - "delivery_ack", - request_id.clone(), - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - }), - ) - .await; - - let stdout = child.stdout.take(); - let stderr = child.stderr.take(); - - let stream_stdout = { - let out_tx = out_tx.clone(); - async move { - if let Some(stdout) = stdout { - let mut lines = BufReader::new(stdout).lines(); - while let Ok(Some(chunk)) = lines.next_line().await { - let _ = send_frame( - &out_tx, - "worker_stream", - None, - json!({ - "stream": "stdout", - "chunk": chunk, - }), - ) - .await; - } - } - } - }; - - let stream_stderr = { - let out_tx = out_tx.clone(); - async move { - if let Some(stderr) = stderr { - let mut lines = BufReader::new(stderr).lines(); - while let Ok(Some(chunk)) = lines.next_line().await { - let _ = send_frame( - &out_tx, - "worker_stream", - None, - json!({ - "stream": "stderr", - "chunk": chunk, - }), - ) - .await; - } - } - } - }; - - let (status, _, _) = tokio::join!(child.wait(), stream_stdout, stream_stderr); - - match status { - Ok(exit_status) => { - final_exit_code = exit_status.code(); - final_exit_signal = None; - if exit_status.success() { - let _ = send_frame( - &out_tx, - "delivery_verified", - None, - json!({ - "delivery_id": delivery_id, - 
"event_id": event_id, - }), - ) - .await; - } else { - let reason = match exit_status.code() { - Some(code) => format!("{} exited with code {}", binary, code), - None => format!("{} exited without an exit code", binary), - }; - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - }), - ) - .await; - } - } - Err(error) => { - let reason = format!("failed waiting for {}: {}", binary, error); - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - }), - ) - .await; - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - "code":"wait_failed", - "message": format!("failed waiting for {}: {}", binary, error), - "retryable": false, - }), - ) - .await; - final_exit_code = Some(1); - } - } - - break; - } - "ping" => { - let ts = frame - .payload - .get("ts_ms") - .and_then(Value::as_u64) - .unwrap_or_default(); - let _ = send_frame(&out_tx, "pong", frame.request_id, json!({"ts_ms": ts})).await; - } - "shutdown_worker" => { - break; - } - other => { - let _ = send_frame( - &out_tx, - "worker_error", - frame.request_id, - json!({ - "code":"unknown_type", - "message": format!("unsupported message type '{}'", other), - "retryable": false, - }), - ) - .await; - } - } - } - - let _ = send_frame( - &out_tx, - "worker_exited", - None, - json!({"code": final_exit_code, "signal": final_exit_signal}), - ) - .await; - drop(out_tx); - let _ = writer_task.await; - - Ok(()) -} - -async fn send_error( - tx: &mpsc::Sender>, - request_id: Option, - code: &str, - message: String, - retryable: bool, - data: Option, -) -> Result<()> { - send_frame( - tx, - "error", - request_id, - json!({ - "code": code, - "message": message, - "retryable": retryable, - "data": data, - }), - ) - .await -} - -async fn send_event(tx: &mpsc::Sender>, payload: Value) -> Result<()> { - send_frame(tx, "event", 
None, payload).await -} - -async fn emit_http_api_event_with_timeout( - tx: &mpsc::Sender>, - payload: Value, - timeout_window: Duration, -) { - match timeout(timeout_window, send_event(tx, payload)).await { - Ok(Ok(())) => {} - Ok(Err(error)) => { - tracing::warn!( - target = "relay_broker::http_api", - error = %error, - "failed to enqueue HTTP API event" - ); - } - Err(_) => { - tracing::warn!( - target = "relay_broker::http_api", - timeout_ms = %timeout_window.as_millis(), - "timed out enqueuing HTTP API event" - ); - } - } -} - -async fn send_frame( - tx: &mpsc::Sender>, - msg_type: &str, - request_id: Option, - payload: Value, -) -> Result<()> { - tx.send(ProtocolEnvelope { - v: PROTOCOL_VERSION, - msg_type: msg_type.to_string(), - request_id, - payload, - }) - .await - .context("failed to enqueue outbound frame") -} - -fn init_tracing() { - let (writer, guard) = tracing_appender::non_blocking(std::io::stderr()); - let subscriber = tracing_subscriber::fmt::Subscriber::builder() - .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), - ) - .with_target(true) - .with_writer(writer) - .finish(); - if tracing::subscriber::set_global_default(subscriber).is_ok() { - let _ = TRACING_GUARD.set(guard); - } -} - -fn channels_from_csv(raw: &str) -> Vec { - raw.split(',') - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(ToOwned::to_owned) - .collect() -} - -/// Default channels for freshly spawned agents. -/// Reads RELAY_DEFAULT_CHANNELS (comma-separated) or falls back to the -/// broker's default channels: vec!["general", "engineering"] — both created -/// at startup by ensure_default_channels(). 
-fn default_spawn_channels() -> Vec { - if let Ok(raw) = std::env::var("RELAY_DEFAULT_CHANNELS") { - let parsed = channels_from_csv(&raw); - if !parsed.is_empty() { - return parsed; - } - } - // channels: ["general", "engineering"] (must match ensure_default_channels) - vec!["general".to_string(), "engineering".to_string()] -} - -fn command_targets_self(cmd_event: &BrokerCommandEvent, self_agent_id: &str) -> bool { - match cmd_event.handler_agent_id.as_deref() { - Some(handler_id) => handler_id == self_agent_id, - None => { - tracing::warn!( - command = %cmd_event.command, - invoked_by = %cmd_event.invoked_by, - "command has no handler_agent_id; accepting by default (multi-broker setups should scope commands)" - ); - true - } - } -} - -fn env_flag_enabled(name: &str) -> bool { - std::env::var(name) - .ok() - .map(|value| value.trim().to_ascii_lowercase()) - .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes" | "on")) -} - -fn delivery_retry_interval() -> Duration { - let ms = std::env::var("AGENT_RELAY_DELIVERY_RETRY_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_DELIVERY_RETRY_MS); - Duration::from_millis(ms.max(50)) -} - -fn http_api_local_delivery_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS); - Duration::from_millis(ms.max(100)) -} - -fn http_api_relaycast_send_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS); - Duration::from_millis(ms.max(500)) -} - -fn http_api_event_emit_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS); - 
Duration::from_millis(ms.max(25)) -} - -fn normalize_channel(raw: &str) -> String { - let trimmed = raw.trim(); - if trimmed.starts_with('#') { - trimmed.to_string() - } else { - format!("#{trimmed}") - } -} - -fn build_agent_state_transition_event(name: &str, state: &str, reason: Option<&str>) -> Value { - let mut payload = json!({ - "type": "agent.state", - "state": state, - "agent": { "name": name }, - "timestamp": chrono::Utc::now().to_rfc3339(), - }); - if let Some(reason) = reason.map(str::trim).filter(|value| !value.is_empty()) { - payload["reason"] = json!(reason); - } - payload -} - -async fn publish_agent_state_transition( - ws_control_tx: &mpsc::Sender, - name: &str, - state: &str, - reason: Option<&str>, -) { - let event = build_agent_state_transition_event(name, state, reason); - if let Err(error) = ws_control_tx.send(WsControl::Publish(event)).await { - tracing::debug!( - agent = %name, - state = %state, - error = %error, - "failed to publish agent state transition" - ); - } -} - -fn normalize_identity_for_thread(raw: &str) -> String { - raw.trim().trim_start_matches('@').to_ascii_lowercase() -} - -fn json_scalar_to_string(value: &Value) -> Option { - match value { - Value::String(text) => { - let trimmed = text.trim(); - if trimmed.is_empty() { - None - } else { - Some(trimmed.to_string()) - } - } - Value::Number(number) => Some(number.to_string()), - _ => None, - } -} - -fn first_string(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(json_scalar_to_string)) -} - -fn first_bool(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(Value::as_bool)) -} - -fn first_u64(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(Value::as_u64)) -} - -fn first_i64(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| 
value.pointer(pointer).and_then(Value::as_i64)) -} - -fn relaycast_ws_control_dedup_key( - workspace_id: &str, - ws_type: &str, - value: &Value, -) -> Option { - let identity = if ws_type == "agent.spawn_requested" { - relaycast_ws_spawn_token(value) - .or_else(|| { - first_string( - value, - &[ - "/event_id", - "/id", - "/payload/id", - "/payload/event_id", - "/agent/id", - "/agent/event_id", - "/message/id", - "/message/event_id", - "/message_id", - ], - ) - }) - .or_else(|| first_string(value, &["/agent/name", "/payload/agent/name", "/name"])) - } else { - first_string( - value, - &[ - "/event_id", - "/id", - "/payload/id", - "/payload/event_id", - "/agent/id", - "/agent/event_id", - "/message/id", - "/message/event_id", - "/message_id", - ], - ) - } - .or_else(|| serde_json::to_string(value).ok())?; - Some(format!("control:{workspace_id}:{ws_type}:{identity}")) -} - -fn relaycast_ws_spawn_token(value: &Value) -> Option { - first_string( - value, - &[ - "/agent/token", - "/agent/relay_key", - "/agent/api_key", - "/token", - ], - ) -} - -fn relaycast_spawn_control_dedup_key(workspace_id: &str, identity: &str) -> String { - format!("control:{workspace_id}:agent.spawn_requested:{identity}") -} - -fn relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key: Option<&str>, - local_spawn_echo_key: &str, -) -> bool { - control_dedup_key != Some(local_spawn_echo_key) -} - -fn note_local_spawn_control_dedup( - dedup: &mut DedupCache, - workspace_id: Option<&str>, - agent_name: &str, - relay_key: Option<&str>, -) { - let Some(workspace_id) = workspace_id else { - return; - }; - let agent_name = agent_name.trim(); - if !agent_name.is_empty() { - let key = relaycast_spawn_control_dedup_key(workspace_id, agent_name); - dedup.insert_if_new(&key, Instant::now()); - } - if let Some(relay_key) = relay_key.map(str::trim).filter(|value| !value.is_empty()) { - let key = relaycast_spawn_control_dedup_key(workspace_id, relay_key); - dedup.insert_if_new(&key, 
Instant::now()); - } -} - -fn is_unknown_worker_error_message(message: &str) -> bool { - message.contains("unknown worker '") -} - -fn is_relaycast_self_control_target( - name: &str, - workspace_self_name: &str, - workspace_self_names: &HashSet, -) -> bool { - let normalized = normalize_identity_for_thread(name); - normalized == normalize_identity_for_thread(workspace_self_name) - || workspace_self_names.contains(&normalized) -} - -fn message_sender(value: &Value) -> Option { - first_string( - value, - &[ - "/from", - "/sender", - "/author", - "/agent_name", - "/message/from", - "/message/sender", - "/message/author", - "/payload/from", - "/payload/sender", - "/payload/author", - "/payload/message/from", - "/payload/message/sender", - "/payload/message/author", - ], - ) -} - -fn message_target(value: &Value) -> Option { - first_string( - value, - &[ - "/target", - "/to", - "/recipient", - "/channel", - "/conversation_id", - "/conversationId", - "/message/target", - "/message/to", - "/message/recipient", - "/message/channel", - "/message/conversation_id", - "/message/conversationId", - "/payload/target", - "/payload/to", - "/payload/recipient", - "/payload/channel", - "/payload/conversation_id", - "/payload/conversationId", - "/payload/message/target", - "/payload/message/to", - "/payload/message/recipient", - "/payload/message/channel", - "/payload/message/conversation_id", - "/payload/message/conversationId", - ], - ) -} - -fn message_preview(value: &Value) -> Option { - let text = first_string( - value, - &[ - "/text", - "/body", - "/content", - "/message/text", - "/message/body", - "/message/content", - "/payload/text", - "/payload/body", - "/payload/content", - "/payload/message/text", - "/payload/message/body", - "/payload/message/content", - "/message", - "/payload/message", - ], - )?; - Some(truncate_thread_preview(&text, 200)) -} - -fn truncate_thread_preview(input: &str, max_len: usize) -> String { - let trimmed = input.trim(); - if trimmed.len() <= 
max_len { - return trimmed.to_string(); - } - let boundary = floor_char_boundary(trimmed, max_len); - let mut out = trimmed[..boundary].to_string(); - out.push_str("..."); - out -} - -fn parse_sort_key_from_raw_timestamp(raw: &str) -> Option { - let trimmed = raw.trim(); - if trimmed.is_empty() { - return None; - } - if let Ok(epoch) = trimmed.parse::() { - return Some(epoch); - } - chrono::DateTime::parse_from_rfc3339(trimmed) - .ok() - .map(|parsed| parsed.timestamp_millis()) -} - -fn message_timestamp_string(value: &Value) -> Option { - first_string( - value, - &[ - "/created_at", - "/createdAt", - "/timestamp", - "/ts", - "/message/created_at", - "/message/createdAt", - "/message/timestamp", - "/message/ts", - "/payload/created_at", - "/payload/createdAt", - "/payload/timestamp", - "/payload/ts", - "/payload/message/created_at", - "/payload/message/createdAt", - "/payload/message/timestamp", - "/payload/message/ts", - ], - ) -} - -fn message_sort_key(value: &Value, index: usize) -> i64 { - if let Some(raw) = message_timestamp_string(value) { - if let Some(parsed) = parse_sort_key_from_raw_timestamp(&raw) { - return parsed; - } - } - - first_i64( - value, - &[ - "/created_at", - "/createdAt", - "/timestamp", - "/ts", - "/message/created_at", - "/message/createdAt", - "/message/timestamp", - "/message/ts", - "/payload/created_at", - "/payload/createdAt", - "/payload/timestamp", - "/payload/ts", - ], - ) - .unwrap_or(index as i64) -} - -fn message_thread_id(value: &Value) -> Option { - if let Some(explicit) = first_string( - value, - &[ - "/thread_id", - "/threadId", - "/parent_id", - "/conversation_id", - "/conversationId", - "/message/thread_id", - "/message/threadId", - "/message/parent_id", - "/message/conversation_id", - "/message/conversationId", - "/payload/thread_id", - "/payload/threadId", - "/payload/parent_id", - "/payload/conversation_id", - "/payload/conversationId", - "/payload/message/thread_id", - "/payload/message/threadId", - 
"/payload/message/parent_id", - "/payload/message/conversation_id", - "/payload/message/conversationId", - ], - ) { - return Some(explicit); - } - - let target = message_target(value)?; - if target.starts_with('#') { - return Some(normalize_channel(&target)); - } - if target.starts_with("conv_") - || target.starts_with("dm_") - || target.chars().all(|ch| ch.is_ascii_digit()) - { - return Some(target); - } - - let sender = message_sender(value)?; - let sender = normalize_identity_for_thread(&sender); - let target = normalize_identity_for_thread(&target); - if sender.is_empty() || target.is_empty() { - return None; - } - let (first, second) = if sender <= target { - (sender, target) - } else { - (target, sender) - }; - Some(format!("direct:{first}:{second}")) -} - -fn is_self_identity(value: &str, self_names: &HashSet) -> bool { - let normalized = normalize_identity_for_thread(value); - !normalized.is_empty() - && self_names - .iter() - .any(|self_name| normalize_identity_for_thread(self_name) == normalized) -} - -fn derive_thread_name(message: &Value, thread_id: &str, self_names: &HashSet) -> String { - if let Some(explicit) = first_string( - message, - &[ - "/thread_name", - "/threadName", - "/title", - "/subject", - "/conversation_name", - "/conversationName", - ], - ) { - return explicit; - } - - if thread_id.starts_with('#') { - return thread_id.to_string(); - } - - // Use participants array (from workspace-level DM data) to build a combined name - // like "WorkerA ↔ WorkerB" for DMs between non-broker agents. 
- if let Some(participants) = message.get("participants").and_then(|v| v.as_array()) { - let names: Vec<&str> = participants - .iter() - .filter_map(|p| p.as_str()) - .filter(|name| !is_self_identity(name, self_names)) - .collect(); - if names.len() >= 2 { - return format!("{} ↔ {}", names[0], names[1]); - } else if names.len() == 1 { - return names[0].to_string(); - } - } - - if let Some(sender) = message_sender(message) { - if !is_self_identity(&sender, self_names) { - return sender.trim().trim_start_matches('@').to_string(); - } - } - - if let Some(target) = message_target(message) { - let trimmed = target.trim().trim_start_matches('@'); - if trimmed.starts_with('#') { - return normalize_channel(trimmed); - } - if !trimmed.is_empty() - && !trimmed.eq_ignore_ascii_case(thread_id) - && !is_self_identity(trimmed, self_names) - && !trimmed.starts_with("conv_") - && !trimmed.starts_with("dm_") - && !trimmed.chars().all(|ch| ch.is_ascii_digit()) - { - return trimmed.to_string(); - } - } - - thread_id.to_string() -} - -fn thread_unread_increment(message: &Value, self_names: &HashSet) -> usize { - if let Some(read) = first_bool( - message, - &[ - "/read", - "/is_read", - "/isRead", - "/message/read", - "/message/is_read", - "/message/isRead", - "/payload/read", - "/payload/is_read", - "/payload/isRead", - "/payload/message/read", - "/payload/message/is_read", - "/payload/message/isRead", - ], - ) { - return usize::from(!read); - } - - if let Some(sender) = message_sender(message) { - return usize::from(!is_self_identity(&sender, self_names)); - } - 0 -} - -fn build_thread_infos(messages: &[Value], self_names: &HashSet) -> Vec { - let mut by_thread: HashMap = HashMap::new(); - - for (index, message) in messages.iter().enumerate() { - let Some(thread_id) = message_thread_id(message) else { - continue; - }; - - let name = derive_thread_name(message, &thread_id, self_names); - let sort_key = message_sort_key(message, index); - let preview = message_preview(message); - let 
timestamp = message_timestamp_string(message); - let explicit_unread = first_u64( - message, - &[ - "/unread_count", - "/unreadCount", - "/message/unread_count", - "/message/unreadCount", - "/payload/unread_count", - "/payload/unreadCount", - "/payload/message/unread_count", - "/payload/message/unreadCount", - ], - ) - .map(|value| value as usize); - let unread_delta = thread_unread_increment(message, self_names); - - let entry = by_thread - .entry(thread_id.clone()) - .or_insert_with(|| ThreadAccumulator { - info: ThreadInfo { - thread_id: thread_id.clone(), - name: name.clone(), - unread_count: 0, - last_message: None, - last_message_at: None, - }, - sort_key, - }); - - if entry.info.name == entry.info.thread_id && name != entry.info.thread_id { - entry.info.name = name.clone(); - } - - if let Some(explicit_unread) = explicit_unread { - entry.info.unread_count = entry.info.unread_count.max(explicit_unread); - } else { - entry.info.unread_count = entry.info.unread_count.saturating_add(unread_delta); - } - - if sort_key >= entry.sort_key { - entry.sort_key = sort_key; - entry.info.name = name; - entry.info.last_message = preview; - entry.info.last_message_at = timestamp; - } - } - - let mut threads: Vec = by_thread.into_values().collect(); - threads.sort_by(|left, right| { - right - .sort_key - .cmp(&left.sort_key) - .then_with(|| left.info.thread_id.cmp(&right.info.thread_id)) - }); - - threads.into_iter().map(|entry| entry.info).collect() -} - -fn record_thread_history_event(history: &mut VecDeque, event: Value) { - if history.len() >= THREAD_HISTORY_LIMIT { - let _ = history.pop_front(); - } - history.push_back(event); -} - -/// Get current terminal size. Returns (rows, cols). -/// -/// Uses `crossterm::terminal::size()`, which is cross-platform: -/// TIOCGWINSZ on unix, GetConsoleScreenBufferInfo on Windows. 
-fn get_terminal_size() -> Option<(u16, u16)> { - crossterm::terminal::size() - .ok() - .map(|(cols, rows)| (rows, cols)) -} - -/// Detect Claude Code auto-suggestion ghost text. -/// -/// Auto-suggestions are rendered with reverse-video cursor + dim ghost text, -/// and often include the "↵ send" hint. -/// Extract Relaycast message IDs from MCP tool response output. -/// -/// When the agent sends a message via MCP (send_dm, send_message, etc.), -/// the response JSON contains `"id": ""`. We extract these IDs -/// and pre-seed the dedup cache so the WS echo of the same message is dropped. -/// This is more robust than name-based filtering since it works regardless -/// of what identity the MCP server registers with. -fn extract_mcp_message_ids(buffer: &str) -> Vec { - let mut ids = Vec::new(); - // Match patterns like "id": "147310274064424960" (Relaycast snowflake IDs are 18-digit numbers) - let mut search_start = 0; - while let Some(key_pos) = buffer[search_start..].find("\"id\"") { - let abs_pos = search_start + key_pos + 4; // skip past "id" - if abs_pos >= buffer.len() { - break; - } - let rest = &buffer[abs_pos..]; - // Skip whitespace and colon - let rest = rest.trim_start(); - let rest = if let Some(r) = rest.strip_prefix(':') { - r.trim_start() - } else { - search_start = abs_pos; - continue; - }; - // Extract quoted value - if let Some(r) = rest.strip_prefix('"') { - if let Some(end) = r.find('"') { - let value = &r[..end]; - // Only match numeric snowflake IDs (15-20 digits) - if value.len() >= 15 - && value.len() <= 20 - && value.chars().all(|c| c.is_ascii_digit()) - { - ids.push(value.to_string()); - } - } - } - search_start = abs_pos; - } - ids -} - -/// Returns the continuity directory path derived from the state file path. -/// State path is always `{cwd}/.agent-relay/state.json`, so parent is `{cwd}/.agent-relay/`. 
-fn continuity_dir(state_path: &Path) -> PathBuf { - state_path - .parent() - .expect("state_path always has a parent (.agent-relay/)") - .join("continuity") -} - -/// Create ephemeral runtime paths in the system temp directory. -/// -/// Unlike `ensure_runtime_paths`, this function: -/// - Writes nothing to the project directory -/// - Uses a deterministic temp directory derived from cwd+broker name so -/// duplicate brokers still collide on the same lock/PID files -/// -/// The temp directory is NOT removed on exit — the OS cleans it up on reboot. -/// State and pending-delivery files are still written there so they don't -/// interfere with the project tree; they're just ephemeral. -/// Ephemeral mode: no lock file, no PID file, no temp directory. -/// The broker lifecycle is tied to the parent process via stdin — when the -/// parent (SDK client) exits, stdin gets EOF and the broker shuts down. -/// Single-instance enforcement is unnecessary here because each SDK client -/// manages its own child process. -fn ensure_ephemeral_paths(_cwd: &Path, _broker_name: &str) -> Result { - // Use a random temp subdir so concurrent ephemeral brokers don't collide - // on state files. 
- let root = std::env::temp_dir().join(format!("agent-relay-ephemeral-{}", std::process::id())); - std::fs::create_dir_all(&root) - .with_context(|| format!("failed to create ephemeral temp dir {}", root.display()))?; - - Ok(RuntimePaths { - persist: false, - state: root.join("state.json"), - pending: root.join("pending.json"), - _lock: None, - }) -} - -fn ensure_runtime_paths( - cwd: &Path, - broker_name: &str, - state_dir: Option<&Path>, -) -> Result { - let root = state_dir - .map(PathBuf::from) - .unwrap_or_else(|| cwd.join(".agent-relay")); - std::fs::create_dir_all(&root) - .with_context(|| format!("failed to create runtime dir {}", root.display()))?; - - // Sanitise name for use in filenames — keep only alphanumeric and hyphens - let safe_name: String = broker_name - .chars() - .map(|c| { - if c.is_alphanumeric() || c == '-' { - c - } else { - '-' - } - }) - .collect(); - - // Lock and PID files are per-broker-name so concurrent workflows can coexist. - let lock_path = root.join(format!("broker-{safe_name}.lock")); - let lock_file = std::fs::File::create(&lock_path) - .with_context(|| format!("failed to create lock file {}", lock_path.display()))?; - - #[cfg(unix)] - { - use std::os::unix::io::AsRawFd; - let fd = lock_file.as_raw_fd(); - let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - // Lock acquisition failed — check if the holder is still alive - // by reading the PID from connection.json. - let connection_path = root.join("connection.json"); - let old_pid = std::fs::read_to_string(&connection_path) - .ok() - .and_then(|c| serde_json::from_str::(&c).ok()) - .and_then(|v| v.get("pid").and_then(|p| p.as_u64())) - .map(|p| p as u32); - if let Some(old_pid) = old_pid { - if !broker::is_pid_alive(old_pid) { - tracing::warn!( - old_pid = old_pid, - "stale broker lock detected (PID {} is dead), recovering", - old_pid - ); - // The old process is dead — remove stale PID file and retry lock. 
- // We drop and re-create the lock file to clear the stale flock. - drop(lock_file); - let lock_file = std::fs::File::create(&lock_path).with_context(|| { - format!( - "failed to re-create lock file after stale recovery {}", - lock_path.display() - ) - })?; - let fd = lock_file.as_raw_fd(); - let rc = - unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - anyhow::bail!( - "another broker instance is already running in this directory ({})", - root.display() - ); - } - // Successfully recovered — PID is written via connection.json at API start - return Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }); - } else { - anyhow::bail!( - "another broker instance is already running in this directory (pid: {}, {})", - old_pid, - root.display() - ); - } - } - // PID file missing or unreadable while lock is held — treat as stale. - // This happens when the user deletes .agent-relay/ while an old broker - // is still alive, or during the shutdown race (PID deleted before flock - // released). 
- tracing::warn!( - "broker lock held but no valid PID file found, treating as stale and recovering" - ); - drop(lock_file); - let lock_file = std::fs::File::create(&lock_path).with_context(|| { - format!( - "failed to re-create lock file after stale recovery {}", - lock_path.display() - ) - })?; - let fd = lock_file.as_raw_fd(); - let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - anyhow::bail!( - "another broker instance is already running in this directory ({})", - root.display() - ); - } - return Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }); - } - } - - // PID is written via connection.json at API start - - Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }) -} - -fn derive_ws_base_url_from_http(http_base: &str) -> String { - let trimmed = http_base.trim(); - if let Some(rest) = trimmed.strip_prefix("https://") { - format!("wss://{rest}") - } else if let Some(rest) = trimmed.strip_prefix("http://") { - format!("ws://{rest}") - } else { - trimmed.to_string() - } + cli::run().await } #[cfg(test)] mod broker_tests; #[cfg(test)] mod worker_tests; - -#[cfg(test)] -mod tests { - use std::{ - collections::{BTreeSet, HashMap, HashSet}, - path::PathBuf, - process::Stdio, - time::{Duration, Instant}, - }; - - use crate::helpers::format_injection; - use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; - use relay_broker::protocol::{AgentSpec, MessageInjectionMode, RelayDelivery}; - use serde_json::{json, Value}; - use tokio::sync::mpsc; - - use super::{ - build_agent_state_transition_event, build_http_api_spawn_spec, build_thread_infos, - channels_from_csv, continuity_dir, delivery_retry_interval, derive_ws_base_url_from_http, - 
detect_bypass_permissions_prompt, detect_claude_trust_prompt, display_target_for_dashboard, - drop_pending_for_worker, extract_mcp_message_ids, http_api_event_emit_timeout, - http_api_local_delivery_timeout, http_api_relaycast_send_timeout, is_auto_suggestion, - is_bypass_selection_menu, is_in_editor_mode, is_relaycast_self_control_target, - is_unknown_worker_error_message, normalize_channel, normalize_initial_task, - normalize_sender, queue_inbound_for_delivery_mode, relaycast_spawn_control_dedup_key, - relaycast_ws_control_dedup_key, relaycast_ws_should_apply_local_spawn_echo_dedup, - relaycast_ws_spawn_token, sender_is_dashboard_label, - should_clear_pending_delivery_for_event, strip_ansi, AgentRuntime, InboundContext, - InboundQueueOutcome, PendingDelivery, ProtocolHeadlessProvider, - }; - use crate::helpers::floor_char_boundary; - use relay_broker::dedup::DedupCache; - use relay_broker::relaycast_ws::{ - format_worker_preregistration_error, RelaycastRegistrationError, - }; - use relay_broker::types::{InboundDeliveryMode, InboundDeliveryState}; - - async fn make_worker_registry_with_worker(name: &str) -> WorkerRegistry { - let (tx, _rx) = mpsc::channel::(16); - let mut registry = WorkerRegistry::new( - tx, - Vec::new(), - PathBuf::from("/tmp/agent-relay-broker-tests"), - Instant::now(), - ); - let mut child = tokio::process::Command::new("cat") - .stdin(Stdio::piped()) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .expect("test worker process should spawn"); - let stdin = child.stdin.take().expect("test worker stdin should exist"); - registry.workers.insert( - name.to_string(), - WorkerHandle { - spec: AgentSpec { - name: name.to_string(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some("cat".to_string()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: Vec::new(), - channels: Vec::new(), - restart_policy: None, - }, - parent: None, - workspace_id: Some("ws_demo".to_string()), - child, - 
stdin, - spawned_at: Instant::now(), - }, - ); - registry - } - - async fn cleanup_worker_registry(mut registry: WorkerRegistry) { - for handle in registry.workers.values_mut() { - let _ = handle.child.start_kill(); - let _ = handle.child.wait().await; - } - } - - fn inbound_ctx<'a>(event_id: &'a str) -> InboundContext<'a> { - InboundContext { - from: "Alice", - body: "hello from relay", - target: "#general", - thread_id: Some("thr_123"), - workspace_id: Some("ws_demo"), - workspace_alias: Some("Demo"), - priority: 1, - mode: MessageInjectionMode::Steer, - event_id: Some(event_id), - } - } - - #[tokio::test] - async fn inbound_queue_auto_inject_drains_immediately_with_full_context() { - let worker_name = "worker-a"; - let workers = make_worker_registry_with_worker(worker_name).await; - let mut delivery_states = HashMap::new(); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - worker_name, - inbound_ctx("evt_auto"), - ); - - match outcome { - InboundQueueOutcome::DrainNow(messages) => { - assert_eq!(messages.len(), 1); - let msg = &messages[0]; - assert_eq!(msg.from, "Alice"); - assert_eq!(msg.body, "hello from relay"); - assert_eq!(msg.target, "#general"); - assert_eq!(msg.thread_id.as_deref(), Some("thr_123")); - assert_eq!(msg.workspace_id.as_deref(), Some("ws_demo")); - assert_eq!(msg.workspace_alias.as_deref(), Some("Demo")); - assert_eq!(msg.priority, 1); - assert_eq!(msg.mode, MessageInjectionMode::Steer); - assert_eq!(msg.event_id.as_deref(), Some("evt_auto")); - } - other => panic!("expected immediate drain, got {other:?}"), - } - assert_eq!( - delivery_states - .get(worker_name) - .expect("state should be created") - .pending_snapshot(), - Vec::new(), - "auto_inject drains the per-worker pending queue in the same broker turn" - ); - - cleanup_worker_registry(workers).await; - } - - #[tokio::test] - async fn inbound_queue_manual_flush_holds_until_explicit_drain() { - let worker_name = "worker-a"; - let workers = 
make_worker_registry_with_worker(worker_name).await; - let mut delivery_states = HashMap::from([( - worker_name.to_string(), - InboundDeliveryState::new(InboundDeliveryMode::ManualFlush), - )]); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - worker_name, - inbound_ctx("evt_manual"), - ); - - assert_eq!(outcome, InboundQueueOutcome::Queued); - let snapshot = delivery_states - .get(worker_name) - .expect("manual state should remain present") - .pending_snapshot(); - assert_eq!(snapshot.len(), 1); - assert_eq!(snapshot[0].event_id.as_deref(), Some("evt_manual")); - assert_eq!(snapshot[0].target, "#general"); - - cleanup_worker_registry(workers).await; - } - - #[tokio::test] - async fn inbound_queue_worker_missing_does_not_create_state() { - let (tx, _rx) = mpsc::channel::(16); - let workers = WorkerRegistry::new( - tx, - Vec::new(), - PathBuf::from("/tmp/agent-relay-broker-tests"), - Instant::now(), - ); - let mut delivery_states = HashMap::new(); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - "ghost", - inbound_ctx("evt_missing"), - ); - - assert_eq!(outcome, InboundQueueOutcome::WorkerMissing); - assert!(delivery_states.is_empty()); - } - - fn extract_kind_literals(source: &str) -> BTreeSet { - let marker = "\"kind\""; - let mut kinds = BTreeSet::new(); - let mut cursor = 0; - while let Some(offset) = source[cursor..].find(marker) { - let mut start = cursor + offset + marker.len(); - if start >= source.len() { - break; - } - if !source[start..].starts_with(':') { - cursor = start; - continue; - } - start += 1; - while start < source.len() && source.as_bytes()[start].is_ascii_whitespace() { - start += 1; - } - if start >= source.len() || source.as_bytes()[start] != b'"' { - cursor = start; - continue; - } - start += 1; - if let Some(end) = source[start..].find('"') { - let candidate = &source[start..start + end]; - if !candidate.is_empty() - && candidate - .chars() - .all(|c| 
c.is_ascii_lowercase() || c == '_' || c.is_ascii_digit()) - { - kinds.insert(candidate.to_string()); - } - } - cursor = start; - if cursor >= source.len() { - break; - } - } - kinds - } - - #[test] - fn parses_channels() { - assert_eq!(channels_from_csv("general,ops"), vec!["general", "ops"]); - } - - #[test] - fn channel_normalization() { - assert_eq!(normalize_channel("general"), "#general"); - assert_eq!(normalize_channel("#ops"), "#ops"); - } - - #[test] - fn normalize_initial_task_drops_empty_values() { - assert_eq!(normalize_initial_task(None), None); - assert_eq!(normalize_initial_task(Some(String::new())), None); - assert_eq!(normalize_initial_task(Some(" ".to_string())), None); - } - - #[test] - fn normalize_initial_task_keeps_non_empty_values() { - assert_eq!( - normalize_initial_task(Some("Ship the patch".to_string())), - Some("Ship the patch".to_string()) - ); - } - - #[test] - fn ws_base_derivation() { - assert_eq!( - derive_ws_base_url_from_http("https://api.relaycast.dev"), - "wss://api.relaycast.dev" - ); - assert_eq!( - derive_ws_base_url_from_http("http://localhost:8787"), - "ws://localhost:8787" - ); - } - - #[test] - fn relaycast_control_dedup_key_prefers_event_id() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { "name": "worker-a", "cli": "claude", "task": "Ship it" } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:evt_123".to_string()) - ); - } - - #[test] - fn relaycast_control_dedup_key_prefers_spawn_token_for_spawn_requests() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it", - "token": "at_live_worker" - } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:at_live_worker".to_string()) - ); - } - - #[test] - 
fn relaycast_control_dedup_key_falls_back_to_agent_name_for_spawn_requests() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:worker-a".to_string()) - ); - } - - #[test] - fn relaycast_control_dedup_key_falls_back_to_serialized_payload() { - let value = json!({ - "type": "agent.release_requested", - "agent": { "name": "worker-a" } - }); - - let key = relaycast_ws_control_dedup_key("ws_1", "agent.release_requested", &value) - .expect("fallback dedup key"); - assert!(key.starts_with("control:ws_1:agent.release_requested:{")); - assert!(key.contains("\"worker-a\"")); - } - - #[test] - fn relaycast_ws_spawn_token_extracts_agent_token() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "token": "at_live_worker" - } - }); - - assert_eq!( - relaycast_ws_spawn_token(&value), - Some("at_live_worker".to_string()) - ); - } - - #[test] - fn relaycast_ws_spawn_name_only_control_key_skips_second_name_dedup() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - let control_key = relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value) - .expect("control dedup key"); - let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); - - assert_eq!(control_key, local_key); - assert!(!relaycast_ws_should_apply_local_spawn_echo_dedup( - Some(control_key.as_str()), - &local_key - )); - } - - #[test] - fn relaycast_ws_spawn_event_id_echo_still_uses_local_name_dedup() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - let control_key = relaycast_ws_control_dedup_key("ws_1", 
"agent.spawn_requested", &value) - .expect("control dedup key"); - let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); - - assert_ne!(control_key, local_key); - assert!(relaycast_ws_should_apply_local_spawn_echo_dedup( - Some(control_key.as_str()), - &local_key - )); - - let now = Instant::now(); - let mut dedup = DedupCache::new(Duration::from_secs(60), 16); - assert!(dedup.insert_if_new(&local_key, now)); - assert!(dedup.insert_if_new(&control_key, now + Duration::from_secs(1))); - assert!(!dedup.insert_if_new(&local_key, now + Duration::from_secs(2))); - } - - #[test] - fn unknown_worker_error_message_matches_release_failures() { - assert!(is_unknown_worker_error_message("unknown worker 'worker-a'")); - assert!(is_unknown_worker_error_message( - "failed to release 'worker-a': unknown worker 'worker-a'" - )); - assert!(!is_unknown_worker_error_message("failed to bind api port")); - } - - #[test] - fn relaycast_self_control_target_matches_aliases_case_insensitively() { - let self_names = HashSet::from([ - "relay-broker".to_string(), - "relay-broker@workspace".to_string(), - ]); - - assert!(is_relaycast_self_control_target( - "Relay-Broker", - "relay-broker", - &self_names - )); - assert!(is_relaycast_self_control_target( - "@relay-broker@workspace", - "relay-broker", - &self_names - )); - assert!(!is_relaycast_self_control_target( - "worker-a", - "relay-broker", - &self_names - )); - } - - #[tokio::test] - async fn contract_health_fixture_requires_rich_listen_health_shape() { - let fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/health-fixtures.json" - )) - .expect("health fixture should be valid JSON"); - let expected_shape = fixture - .get("health_response") - .and_then(Value::as_object) - .expect("health fixture must include health_response object"); - - let actual = crate::listen_api::listen_api_health_payload(None, vec![]); - - for required_key in expected_shape.keys() { - // 
TODO(contract-wave1-health-shape): listen-mode /health should - // implement the shared BrokerHealthResponse contract fields. - assert!( - actual.get(required_key).is_some(), - "listen /health response is missing required contract field: {}", - required_key - ); - } - } - - #[tokio::test] - async fn contract_startup_429_fixture_requires_degraded_health_status() { - let fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/health-fixtures.json" - )) - .expect("health fixture should be valid JSON"); - let expected = fixture - .get("wave0_startup_429_degraded") - .and_then(|v| v.get("expected_health_status")) - .and_then(Value::as_str) - .expect("health fixture must include expected degraded health status"); - let startup_error_code = fixture - .get("wave0_startup_429_degraded") - .and_then(|v| v.get("error")) - .and_then(|v| v.get("code")) - .and_then(Value::as_str) - .expect("health fixture must include startup error code"); - std::env::set_var("AGENT_RELAY_STARTUP_ERROR_CODE", startup_error_code); - let actual = crate::listen_api::listen_api_health_payload(None, vec![]) - .get("status") - .and_then(Value::as_str) - .unwrap_or("unknown") - .to_string(); - std::env::remove_var("AGENT_RELAY_STARTUP_ERROR_CODE"); - - assert_eq!( - actual, expected, - "listen /health status \"{}\" does not match startup 429 degraded contract \"{}\"", - actual, expected - ); - } - - #[test] - fn contract_replay_fixture_requires_replay_route_exposure() { - let replay_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/replay-fixtures.json" - )) - .expect("replay fixture should be valid JSON"); - assert!( - replay_fixture.get("replay_cursor_request").is_some(), - "replay fixture must include replay_cursor_request" - ); - assert!( - replay_fixture.get("replay_response").is_some(), - "replay fixture must include replay_response" - ); - - let source = include_str!("listen_api.rs"); - assert!( - 
source.contains(".route(\"/api/events/replay\""), - "listen API router does not expose /api/events/replay" - ); - } - - #[test] - fn contract_timeout_fixture_requires_terminal_failed_guard_before_late_ack() { - let replay_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/replay-fixtures.json" - )) - .expect("replay fixture should be valid JSON"); - let timeout_fixture = replay_fixture - .get("wave0_timeout_terminal_semantics") - .and_then(Value::as_object) - .expect("replay fixture must include wave0_timeout_terminal_semantics object"); - - let expected_terminal_status = timeout_fixture - .get("expected_terminal_status") - .and_then(Value::as_str) - .expect("timeout fixture requires expected_terminal_status"); - let late_event_kind = timeout_fixture - .get("late_event_kind") - .and_then(Value::as_str) - .expect("timeout fixture requires late_event_kind"); - - let source = include_str!("main.rs"); - let ack_branch = source - .find("msg_type == \"delivery_ack\"") - .map(|idx| { - let end = (idx + 1200).min(source.len()); - &source[idx..end] - }) - .expect("main.rs must include delivery_ack handling"); - - assert!( - ack_branch.contains(expected_terminal_status) || ack_branch.contains("terminal"), - "delivery_ack branch lacks terminal guard for timeout status \"{}\" and late event \"{}\"", - expected_terminal_status, - late_event_kind - ); - } - - #[test] - fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() { - let event_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/event-fixtures.json" - )) - .expect("event fixture should be valid JSON"); - let required = event_fixture - .get("wave0_broadcast_whitelist") - .and_then(|v| v.get("required_kinds")) - .and_then(Value::as_array) - .expect("event fixture must include wave0_broadcast_whitelist.required_kinds") - .iter() - .filter_map(Value::as_str) - .map(str::to_owned) - .collect::>(); - - let emitted = 
extract_kind_literals(include_str!("main.rs")); - - assert!( - required.is_subset(&emitted), - "broker source is missing required broadcast kinds; expected {:?}, got {:?}", - required, - emitted - ); - } - - #[test] - fn build_thread_infos_groups_channel_messages() { - let messages = vec![ - json!({ - "from": "broker", - "target": "#general", - "text": "outbound", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "Lead", - "target": "#general", - "text": "inbound", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].thread_id, "#general"); - assert_eq!(threads[0].name, "#general"); - assert_eq!(threads[0].unread_count, 1); - assert_eq!(threads[0].last_message.as_deref(), Some("inbound")); - } - - #[test] - fn build_thread_infos_groups_direct_messages_case_insensitively() { - let messages = vec![ - json!({ - "from": "BROKER", - "to": "WorkerA", - "text": "ping", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "workera", - "to": "broker", - "text": "pong", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].thread_id, "direct:broker:workera"); - assert_eq!(threads[0].name, "workera"); - assert_eq!(threads[0].unread_count, 1); - assert_eq!(threads[0].last_message.as_deref(), Some("pong")); - } - - #[test] - fn build_thread_infos_uses_dm_conversation_id_and_sender_name() { - let messages = vec![json!({ - "from": "Planner", - "conversation_id": "conv_123", - "text": "dm payload", - "timestamp": "2026-02-23T10:01:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - 
assert_eq!(threads[0].thread_id, "conv_123"); - assert_eq!(threads[0].name, "Planner"); - assert_eq!(threads[0].unread_count, 1); - } - - #[test] - fn build_thread_infos_shows_dms_between_non_broker_agents() { - let messages = vec![ - json!({ - "from": "WorkerA", - "conversation_id": "dm_456", - "participants": ["WorkerA", "WorkerB"], - "text": "hello WorkerB", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "WorkerB", - "conversation_id": "dm_456", - "participants": ["WorkerA", "WorkerB"], - "text": "hi WorkerA", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1, "should group into one conversation"); - assert_eq!(threads[0].thread_id, "dm_456"); - assert_eq!(threads[0].name, "WorkerA ↔ WorkerB"); - assert_eq!( - threads[0].unread_count, 2, - "both messages unread (neither from broker)" - ); - assert_eq!(threads[0].last_message.as_deref(), Some("hi WorkerA")); - } - - #[test] - fn build_thread_infos_dm_with_participants_filters_broker() { - let messages = vec![json!({ - "from": "WorkerA", - "conversation_id": "dm_789", - "participants": ["broker", "WorkerA"], - "text": "hello broker", - "timestamp": "2026-02-23T10:00:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!( - threads[0].name, "WorkerA", - "should filter out broker from participants" - ); - } - - #[test] - fn build_thread_infos_multiple_independent_dm_conversations() { - let messages = vec![ - json!({ - "from": "Alice", - "conversation_id": "dm_aaa", - "participants": ["Alice", "Bob"], - "text": "hi Bob", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "Charlie", - "conversation_id": "dm_bbb", - "participants": ["Charlie", "Diana"], - "text": "hi Diana", - "timestamp": "2026-02-23T10:01:00Z", - }), 
- json!({ - "from": "broker", - "conversation_id": "dm_ccc", - "participants": ["broker", "Eve"], - "text": "hi Eve", - "timestamp": "2026-02-23T10:02:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!( - threads.len(), - 3, - "should have three separate DM conversations" - ); - - let thread_aaa = threads.iter().find(|t| t.thread_id == "dm_aaa").unwrap(); - assert_eq!(thread_aaa.name, "Alice ↔ Bob"); - - let thread_bbb = threads.iter().find(|t| t.thread_id == "dm_bbb").unwrap(); - assert_eq!(thread_bbb.name, "Charlie ↔ Diana"); - - let thread_ccc = threads.iter().find(|t| t.thread_id == "dm_ccc").unwrap(); - assert_eq!(thread_ccc.name, "Eve", "broker filtered from participants"); - } - - #[test] - fn build_thread_infos_respects_explicit_unread_count() { - let messages = vec![json!({ - "from": "Planner", - "target": "broker", - "text": "status", - "unread_count": 7, - "timestamp": "2026-02-23T10:01:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].unread_count, 7); - } - - #[test] - fn build_agent_state_transition_event_has_expected_shape() { - let payload = build_agent_state_transition_event("worker-a", "spawned", Some("sdk_spawn")); - assert_eq!(payload["type"], "agent.state"); - assert_eq!(payload["state"], "spawned"); - assert_eq!(payload["agent"]["name"], "worker-a"); - assert_eq!(payload["reason"], "sdk_spawn"); - assert!(payload["timestamp"].as_str().is_some()); - - let no_reason = build_agent_state_transition_event("worker-a", "idle", None); - assert!(no_reason.get("reason").is_none()); - } - - #[test] - fn preregistration_error_message_dedupes_retry_after_for_rate_limit() { - let error = RelaycastRegistrationError::RateLimited { - agent_name: "Foobar".to_string(), - retry_after_secs: 60, - detail: 
"{\"ok\":false}".to_string(), - }; - let message = format_worker_preregistration_error("Foobar", &error); - assert_eq!(message.matches("retry after").count(), 1); - } - - #[test] - fn preregistration_error_message_does_not_invent_retry_after_for_transport_errors() { - let error = RelaycastRegistrationError::Transport { - agent_name: "Foobar".to_string(), - detail: "timeout".to_string(), - }; - let message = format_worker_preregistration_error("Foobar", &error); - assert!(!message.contains("retry after")); - } - - #[test] - fn injection_format_preserved() { - let rendered = format_injection("alice", "evt_1", "hello", "bob"); - assert!(rendered.contains("")); - assert!(rendered.contains("mcp__relaycast__message_dm_send")); - assert!(rendered.contains("Relay message from alice [evt_1]: hello")); - } - - #[test] - fn injection_format_includes_channel() { - let rendered = format_injection("alice", "evt_1", "hello", "#general"); - assert!(rendered.contains("mcp__relaycast__message_post")); - assert!(rendered.contains("channel: \"general\"")); - assert!(rendered.contains("Relay message from alice in #general [evt_1]: hello")); - } - - #[test] - fn normalize_sender_defaults_to_human_orchestrator() { - assert_eq!(normalize_sender(None), "human:orchestrator"); - assert_eq!(normalize_sender(Some(String::new())), "human:orchestrator"); - assert_eq!( - normalize_sender(Some(" ".to_string())), - "human:orchestrator" - ); - } - - #[test] - fn normalize_sender_normalizes_human_prefix() { - assert_eq!( - normalize_sender(Some("human: Dashboard ".to_string())), - "human:Dashboard" - ); - } - - #[test] - fn normalize_sender_preserves_worker_names() { - assert_eq!( - normalize_sender(Some("WorkerOne".to_string())), - "WorkerOne".to_string() - ); - } - - #[test] - fn sender_is_dashboard_label_accepts_legacy_dashboard_senders() { - assert!(sender_is_dashboard_label("Dashboard", "my-project")); - assert!(sender_is_dashboard_label("human:Dashboard", "my-project")); - 
assert!(sender_is_dashboard_label( - "human:orchestrator", - "my-project" - )); - assert!(sender_is_dashboard_label("my-project", "my-project")); - assert!(!sender_is_dashboard_label("Lead", "my-project")); - } - - #[test] - fn display_target_for_dashboard_maps_self_identity() { - let mut self_names = HashSet::new(); - self_names.insert("broker-951762d5".to_string()); - self_names.insert("DashProbe".to_string()); - let primary = "my-project"; - - assert_eq!( - display_target_for_dashboard("broker-951762d5", &self_names, primary), - "my-project" - ); - assert_eq!( - display_target_for_dashboard("dashprobe", &self_names, primary), - "my-project" - ); - assert_eq!( - display_target_for_dashboard("Lead", &self_names, primary), - "Lead".to_string() - ); - } - - #[test] - fn delivery_retry_interval_uses_default_and_env_override() { - std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); - assert_eq!(delivery_retry_interval().as_millis(), 1_000); - - std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "250"); - assert_eq!(delivery_retry_interval().as_millis(), 250); - - std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "1"); - assert_eq!(delivery_retry_interval().as_millis(), 50); - - std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); - } - - #[test] - fn http_api_timeout_windows_use_default_and_env_override() { - std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); - - assert_eq!(http_api_local_delivery_timeout().as_millis(), 3_000); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 20_000); - assert_eq!(http_api_event_emit_timeout().as_millis(), 200); - - std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "10"); - std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "100"); - std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "1"); - - 
assert_eq!(http_api_local_delivery_timeout().as_millis(), 100); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 500); - assert_eq!(http_api_event_emit_timeout().as_millis(), 25); - - std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "1500"); - std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "12000"); - std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "150"); - - assert_eq!(http_api_local_delivery_timeout().as_millis(), 1_500); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 12_000); - assert_eq!(http_api_event_emit_timeout().as_millis(), 150); - - std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); - } - - #[test] - fn drop_pending_for_worker_removes_only_matching_entries() { - let mut pending = HashMap::new(); - pending.insert( - "del_1".to_string(), - PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }, - ); - pending.insert( - "del_2".to_string(), - PendingDelivery { - worker_name: "B".to_string(), - delivery: RelayDelivery { - delivery_id: "del_2".to_string(), - event_id: "evt_2".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "y".to_string(), - target: "#general".to_string(), - body: "world".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }, - ); - - let 
dropped = drop_pending_for_worker(&mut pending, "A"); - assert_eq!(dropped, 1); - assert!(pending.contains_key("del_2")); - assert!(!pending.contains_key("del_1")); - } - - #[test] - fn should_clear_pending_delivery_when_event_id_matches() { - let pending = PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }; - - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - Some("evt_1") - )); - assert!(!should_clear_pending_delivery_for_event( - Some(&pending), - Some("evt_2") - )); - } - - #[test] - fn should_clear_pending_delivery_without_event_id_for_compatibility() { - let pending = PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }; - - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - None - )); - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - Some("") - )); - assert!(should_clear_pending_delivery_for_event(None, Some("evt_1"))); - } - - // ==================== strip_ansi tests ==================== - - #[test] - fn strip_ansi_removes_csi_sequences() { - assert_eq!(strip_ansi("\x1b[32mHello\x1b[0m"), "Hello"); - assert_eq!(strip_ansi("\x1b[1;31mred bold\x1b[0m"), "red bold"); - } - - #[test] - fn 
strip_ansi_removes_osc_sequences() { - assert_eq!(strip_ansi("\x1b]0;title\x07rest"), "rest"); - assert_eq!(strip_ansi("\x1b]0;title\x1b\\rest"), "rest"); - } - - #[test] - fn strip_ansi_preserves_plain_text() { - assert_eq!(strip_ansi("Hello world"), "Hello world"); - assert_eq!(strip_ansi(""), ""); - } - - #[test] - fn strip_ansi_handles_mixed_content() { - let input = "\x1b[33m⚠️ bypass\x1b[0m permissions mode\n\x1b[1m(yes/no)\x1b[0m"; - let clean = strip_ansi(input); - assert!(clean.contains("bypass")); - assert!(clean.contains("(yes/no)")); - assert!(!clean.contains("\x1b")); - } - - #[test] - fn strip_ansi_handles_cursor_forward_sequences() { - // Claude Code uses \x1b[1C (cursor forward) instead of spaces - // These should be replaced with spaces so echo detection works - let input = "\x1b[1CYes,\x1b[1CI\x1b[1Caccept"; - let clean = strip_ansi(input); - assert_eq!(clean, " Yes, I accept"); - } - - // ==================== floor_char_boundary tests ==================== - - #[test] - fn floor_char_boundary_at_valid_positions() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 0), 0); - assert_eq!(floor_char_boundary(s, 6), 6); - assert_eq!(floor_char_boundary(s, 9), 9); - } - - #[test] - fn floor_char_boundary_mid_multibyte() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 7), 6); - assert_eq!(floor_char_boundary(s, 8), 6); - } - - #[test] - fn floor_char_boundary_past_end() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 100), s.len()); - } - - // ==================== detect_bypass_permissions_prompt tests ==================== - - #[test] - fn bypass_perms_yes_no_prompt() { - let output = "⚠️ Bypassing all permission checks.\nDo you want to proceed? (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_dangerously_with_yn() { - let output = "Running with --dangerously-skip-permissions\nAccept the risks? 
(y/n)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_accept_risk_variant() { - let output = - "bypass permissions mode enabled\nDo you accept the risk of running in this mode?"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_no_match_normal_output() { - let output = "I'll help you fix that bug. Let me read the file first."; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(!has_ref); - assert!(!has_confirm); - } - - #[test] - fn bypass_perms_no_false_positive_permission_without_bypass() { - let output = "File permission denied. (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(!has_ref, "permission without bypass should not match"); - assert!(has_confirm, "yes/no detected but insufficient alone"); - } - - #[test] - fn bypass_perms_no_false_positive_status_bar() { - let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref, "status bar has bypass+permissions"); - assert!(!has_confirm, "but no confirmation prompt"); - } - - #[test] - fn bypass_perms_selection_menu_format() { - let output = "WARNING: ClaudeCoderunninginBypassPermissionsmode\n\ - Byproceeding,youacceptallresponsibility\n\ - No,exit\nYes,Iaccept\nEntertoconfirm"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - assert!(is_bypass_selection_menu(output)); - } - - #[test] - fn bypass_perms_selection_menu_with_spaces() { - let output = "WARNING: Claude Code running in Bypass Permissions mode\n\ - 1. No, exit\n2. 
Yes, I accept\nEnter to confirm"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref && has_confirm); - assert!(is_bypass_selection_menu(output)); - } - - #[test] - fn bypass_perms_legacy_not_selection_menu() { - let output = "bypass permissions mode\nProceed? (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref && has_confirm, "legacy should still detect"); - assert!( - !is_bypass_selection_menu(output), - "legacy should NOT be selection menu" - ); - } - - #[test] - fn bypass_perms_with_raw_ansi() { - let raw = "\x1b[33m⚠️ bypass permissions\x1b[0m mode\nProceed? \x1b[1m(yes/no)\x1b[0m"; - let clean = strip_ansi(raw); - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(&clean); - assert!(has_ref && has_confirm); - } - - // ==================== detect_claude_trust_prompt tests ==================== - - #[test] - fn claude_trust_prompt_full_match() { - let output = "take a moment to review what's in this folder first.\n\ - Claude Code'll be able to read, edit, and execute files here.\n\ - Security guide\n\ - ❯ 1. Yes, I trust this folder\n\ - 2. No, exit\n\ - Enter to confirm · Esc to cancel"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(has_confirmation); - } - - #[test] - fn claude_trust_prompt_stripped_spaces() { - let output = "Yes,Itrustthisfolder\nNo,exit"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(has_confirmation); - } - - #[test] - fn claude_trust_prompt_no_match_normal_output() { - let output = "I'll help you fix that bug. 
Let me read the file first."; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(!has_trust_ref); - assert!(!has_confirmation); - } - - #[test] - fn claude_trust_prompt_partial_no_exit() { - let output = "Yes, I trust this folder"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(!has_confirmation, "should not match without exit option"); - } - - #[test] - fn claude_trust_prompt_with_ansi() { - let raw = "\x1b[1m❯ 1. Yes, I trust this folder\x1b[0m\n 2. No, exit"; - let clean = strip_ansi(raw); - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(&clean); - assert!(has_trust_ref && has_confirmation); - } - - // ==================== is_in_editor_mode tests ==================== - - #[test] - fn editor_mode_vim_insert() { - assert!(is_in_editor_mode("Some text\n-- INSERT --\n")); - assert!(is_in_editor_mode("Some text\n-- INSERT --")); - } - - #[test] - fn editor_mode_claude_cli_not_vim() { - let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; - assert!(!is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_nano() { - let output = " GNU nano 5.8\nFile: test.txt\n^G Get Help ^O Write Out"; - assert!(is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_less_pager() { - assert!(is_in_editor_mode("some content\n(END)")); - assert!(is_in_editor_mode("some content\n--More--")); - } - - #[test] - fn editor_mode_normal_output() { - assert!(!is_in_editor_mode( - "I'll help you with that task. Let me search." 
- )); - assert!(!is_in_editor_mode("$ ls -la\ntotal 0\n$ ")); - } - - #[test] - fn editor_mode_with_ansi() { - let output = "\x1b[32mSome text\x1b[0m\n-- INSERT --\n"; - assert!(is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_vim_visual_modes() { - assert!(is_in_editor_mode("text\n-- VISUAL --\n")); - assert!(is_in_editor_mode("text\n-- VISUAL LINE --\n")); - assert!(is_in_editor_mode("text\n-- VISUAL BLOCK --\n")); - assert!(is_in_editor_mode("text\n-- REPLACE --\n")); - } - - #[test] - fn editor_mode_claude_normal_not_vim() { - assert!(!is_in_editor_mode("-- NORMAL -- ► some Claude UI text")); - assert!(!is_in_editor_mode("-- VISUAL -- ▶ Claude UI")); - } - - #[test] - fn auto_suggestion_detects_cursor_plus_dim_pattern() { - assert!(is_auto_suggestion( - "\x1b[7mW\x1b[27m\x1b[2mhat's the task?\x1b[22m" - )); - } - - #[test] - fn auto_suggestion_detects_send_hint() { - assert!(is_auto_suggestion(" ↵ send")); - } - - #[test] - fn auto_suggestion_ignores_normal_output() { - assert!(!is_auto_suggestion("Relay message from Alice [abc]: hello")); - assert!(!is_auto_suggestion("Running tests...")); - assert!(!is_auto_suggestion("> \x1b[7m \x1b[27m")); - } - - #[test] - fn extract_mcp_ids_from_tool_response() { - let output = r#" ⎿ { - "id": "147310274064424960", - "conversation_id": "147310245874507776", - "from": "agent-a", - "text": "hello" - }"#; - let ids = extract_mcp_message_ids(output); - // Only extracts "id" keys, not "conversation_id" - assert_eq!(ids, vec!["147310274064424960"]); - } - - #[test] - fn extract_mcp_ids_ignores_short_ids() { - let output = r#""id": "123""#; - assert!(extract_mcp_message_ids(output).is_empty()); - } - - #[test] - fn extract_mcp_ids_ignores_non_numeric() { - let output = r#""id": "msg_abc123def456ghi""#; - assert!(extract_mcp_message_ids(output).is_empty()); - } - - #[test] - fn extract_mcp_ids_handles_no_ids() { - assert!(extract_mcp_message_ids("normal output with no JSON").is_empty()); - 
assert!(extract_mcp_message_ids("").is_empty()); - } - - // ==================== bypass flag selection logic tests ==================== - // Tests for the bypass flag logic used in WorkerRegistry::spawn(). - // The logic is: claude/claude:* → --dangerously-skip-permissions, codex → --dangerously-bypass-approvals-and-sandbox - - fn compute_bypass_flag(cli: &str, existing_args: &[String]) -> Option<&'static str> { - let cli_lower = cli.to_lowercase(); - if (cli_lower == "claude" || cli_lower.starts_with("claude:")) - && !existing_args - .iter() - .any(|a| a.contains("dangerously-skip-permissions")) - { - Some("--dangerously-skip-permissions") - } else if cli_lower == "codex" - && !existing_args - .iter() - .any(|a| a.contains("dangerously-bypass") || a.contains("full-auto")) - { - Some("--dangerously-bypass-approvals-and-sandbox") - } else if cli_lower == "gemini" && !existing_args.iter().any(|a| a == "--yolo" || a == "-y") - { - Some("--yolo") - } else { - None - } - } - - #[test] - fn bypass_flag_claude_gets_skip_permissions() { - assert_eq!( - compute_bypass_flag("claude", &[]), - Some("--dangerously-skip-permissions") - ); - } - - #[test] - fn bypass_flag_claude_variant_gets_skip_permissions() { - assert_eq!( - compute_bypass_flag("claude:latest", &[]), - Some("--dangerously-skip-permissions") - ); - assert_eq!( - compute_bypass_flag("Claude", &[]), - Some("--dangerously-skip-permissions") - ); - assert_eq!( - compute_bypass_flag("CLAUDE:v2", &[]), - Some("--dangerously-skip-permissions") - ); - } - - #[test] - fn bypass_flag_codex_gets_dangerously_bypass() { - assert_eq!( - compute_bypass_flag("codex", &[]), - Some("--dangerously-bypass-approvals-and-sandbox") - ); - } - - #[test] - fn bypass_flag_gemini_gets_yolo() { - assert_eq!(compute_bypass_flag("gemini", &[]), Some("--yolo")); - } - - #[test] - fn bypass_flag_gemini_dedup_when_yolo_present() { - let args = vec!["--yolo".to_string()]; - assert_eq!( - compute_bypass_flag("gemini", &args), - None, - "should 
not duplicate --yolo flag" - ); - } - - #[test] - fn bypass_flag_gemini_dedup_when_y_present() { - let args = vec!["-y".to_string()]; - assert_eq!( - compute_bypass_flag("gemini", &args), - None, - "should not duplicate when -y shorthand present" - ); - } - - #[test] - fn bypass_flag_aider_gets_none() { - assert_eq!(compute_bypass_flag("aider", &[]), None); - } - - #[test] - fn bypass_flag_goose_gets_none() { - assert_eq!(compute_bypass_flag("goose", &[]), None); - } - - #[test] - fn bypass_flag_unknown_cli_gets_none() { - assert_eq!(compute_bypass_flag("mystery-cli", &[]), None); - } - - #[test] - fn bypass_flag_claude_dedup_when_already_present() { - let args = vec!["--dangerously-skip-permissions".to_string()]; - assert_eq!( - compute_bypass_flag("claude", &args), - None, - "should not duplicate flag" - ); - } - - #[test] - fn bypass_flag_codex_dedup_when_already_present() { - let args = vec!["--dangerously-bypass-approvals-and-sandbox".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - None, - "should not duplicate flag" - ); - } - - #[test] - fn bypass_flag_codex_dedup_when_full_auto_present() { - let args = vec!["--full-auto".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - None, - "should not add bypass when --full-auto already present" - ); - } - - #[test] - fn bypass_flag_claude_dedup_partial_match() { - // If someone passes a different arg containing the substring, still dedup - let args = vec!["--my-dangerously-skip-permissions-flag".to_string()]; - assert_eq!( - compute_bypass_flag("claude", &args), - None, - "substring match should prevent duplication" - ); - } - - #[test] - fn bypass_flag_codex_with_other_args() { - let args = vec!["--model".to_string(), "gpt-4".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - Some("--dangerously-bypass-approvals-and-sandbox"), - "unrelated args should not prevent bypass flag" - ); - } - - // ==================== is_pid_alive ==================== - - #[test] - 
fn is_pid_alive_returns_true_for_self() { - let pid = std::process::id(); - assert!( - crate::broker::is_pid_alive(pid), - "current process PID should be alive" - ); - } - - #[test] - fn is_pid_alive_returns_false_for_dead_pid() { - // Spawn a short-lived child, wait for it to exit, then verify it's dead - let child = std::process::Command::new("true") - .spawn() - .expect("failed to spawn 'true'"); - let pid = child.id(); - let mut child = child; - child.wait().expect("failed to wait on child"); - // After the child exits, its PID should not be alive - // (the PID may be recycled, but on macOS/Linux it won't be immediately) - assert!( - !crate::broker::is_pid_alive(pid), - "exited child PID should be dead" - ); - } - - #[test] - fn is_pid_alive_returns_false_for_bogus_pid() { - // PID 0 is the kernel scheduler — kill(0, 0) signals the entire process group, - // not a real target. Use a very high PID that almost certainly doesn't exist. - // On macOS pid_max is ~99999; on Linux it's typically 32768 or 4194304. - // 4_000_000 is unlikely to be in use. - assert!( - !crate::broker::is_pid_alive(4_000_000), - "bogus PID 4_000_000 should not be alive (ESRCH)" - ); - } - - #[test] - fn is_pid_alive_eperm_means_alive() { - // PID 1 (launchd/init) is owned by root. When run as a normal user, - // kill(1, 0) returns EPERM — the process exists but we can't signal it. - // This is exactly the EPERM case our fix handles. - // Skip if running as root (e.g., in some CI containers) since root can - // signal any process and would get rc=0 instead of EPERM. 
- if unsafe { nix::libc::getuid() } == 0 { - eprintln!("skipping EPERM test: running as root"); - return; - } - assert!( - crate::broker::is_pid_alive(1), - "PID 1 (init/launchd) should report alive via EPERM" - ); - } - - // ==================== write_pid_file ==================== - - // ==================== continuity_dir ==================== - - #[test] - fn continuity_dir_derives_correct_path_from_state_json() { - let state_path = std::path::Path::new("/project/.agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!( - result, - std::path::PathBuf::from("/project/.agent-relay/continuity") - ); - } - - #[test] - fn continuity_dir_works_with_nested_project_path() { - let state_path = std::path::Path::new("/home/user/projects/my-app/.agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!( - result, - std::path::PathBuf::from("/home/user/projects/my-app/.agent-relay/continuity") - ); - } - - #[test] - fn continuity_dir_preserves_relative_paths() { - let state_path = std::path::Path::new(".agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!(result, std::path::PathBuf::from(".agent-relay/continuity")); - } - - #[test] - fn http_api_spawn_spec_defaults_to_pty_runtime() { - let spec = build_http_api_spawn_spec( - "worker-a".to_string(), - "codex".to_string(), - None, - Some("o3".to_string()), - vec!["--fast".to_string()], - vec!["general".to_string()], - Some("/tmp/project".to_string()), - Some("core".to_string()), - Some("Lead".to_string()), - Some("subagent".to_string()), - None, - ) - .expect("spec should build"); - - assert!(matches!(spec.runtime, AgentRuntime::Pty)); - assert!(spec.provider.is_none()); - assert_eq!(spec.cli.as_deref(), Some("codex")); - assert_eq!(spec.model.as_deref(), Some("o3")); - } - - #[test] - fn http_api_spawn_spec_uses_headless_runtime_for_supported_providers() { - let spec = build_http_api_spawn_spec( - "worker-a".to_string(), - "opencode".to_string(), - 
Some("headless".to_string()), - Some("ignored".to_string()), - vec![], - vec!["general".to_string()], - None, - None, - None, - None, - None, - ) - .expect("headless spec should build"); - - assert!(matches!(spec.runtime, AgentRuntime::Headless)); - assert!(matches!( - spec.provider, - Some(ProtocolHeadlessProvider::Opencode) - )); - assert!(spec.cli.is_none()); - assert_eq!(spec.model.as_deref(), Some("ignored")); - } - - #[test] - fn headless_provider_command_claude_places_flags_before_task() { - let (bin, args) = super::headless_provider_command( - &ProtocolHeadlessProvider::Claude, - "hello world", - &[ - "--mcp-config".to_string(), - "{\"mcpServers\":{}}".to_string(), - ], - ); - - assert_eq!(bin, "claude"); - assert_eq!(args.last().map(String::as_str), Some("hello world")); - let mcp_pos = args.iter().position(|a| a == "--mcp-config").unwrap(); - let task_pos = args.iter().position(|a| a == "hello world").unwrap(); - assert!(mcp_pos < task_pos, "--mcp-config must precede task"); - } - - #[test] - fn headless_provider_command_opencode_places_flags_before_task() { - let (bin, args) = super::headless_provider_command( - &ProtocolHeadlessProvider::Opencode, - "hello world", - &["--agent".to_string(), "relaycast".to_string()], - ); - - assert_eq!(bin, "opencode"); - assert_eq!(args.first().map(String::as_str), Some("run")); - assert_eq!(args.last().map(String::as_str), Some("hello world")); - let agent_pos = args.iter().position(|a| a == "--agent").unwrap(); - let task_pos = args.iter().position(|a| a == "hello world").unwrap(); - assert!(agent_pos < task_pos, "--agent must precede task"); - } - - #[test] - fn http_api_spawn_spec_rejects_unknown_headless_providers() { - let error = build_http_api_spawn_spec( - "worker-a".to_string(), - "codex".to_string(), - Some("headless".to_string()), - None, - vec![], - vec!["general".to_string()], - None, - None, - None, - None, - None, - ) - .expect_err("unsupported headless provider should fail"); - - assert!( - error - 
.to_string() - .contains("does not support headless transport"), - "unexpected error: {error}" - ); - } - - // ==================== model flag injection tests ==================== - // Tests for the --model flag injection logic used in WorkerRegistry::spawn(). - // When spec.model is set and non-empty, the broker should inject --model - // into the spawned CLI's argv, unless the user already specified --model. - - /// Mirror of the model flag logic in WorkerRegistry::spawn(). - fn compute_model_flag(model: Option<&str>, existing_args: &[String]) -> Option { - model.and_then(|m| { - if m.is_empty() - || existing_args - .iter() - .any(|a| a == "--model" || a.starts_with("--model=") || a == "-m") - { - None - } else { - Some(m.to_string()) - } - }) - } - - #[test] - fn model_flag_injected_when_present() { - assert_eq!( - compute_model_flag(Some("haiku"), &[]), - Some("haiku".to_string()), - "model should be injected when set and args are empty" - ); - } - - #[test] - fn model_flag_not_injected_when_none() { - assert_eq!( - compute_model_flag(None, &[]), - None, - "model should not be injected when not set" - ); - } - - #[test] - fn model_flag_not_injected_when_empty() { - assert_eq!( - compute_model_flag(Some(""), &[]), - None, - "model should not be injected when empty string" - ); - } - - #[test] - fn model_flag_not_injected_when_already_in_args() { - let args = vec!["--model".to_string(), "opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), &args), - None, - "model should not be injected when --model already in args" - ); - } - - #[test] - fn model_flag_not_injected_when_short_flag_in_args() { - let args = vec!["-m".to_string(), "opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), &args), - None, - "model should not be injected when -m already in args" - ); - } - - #[test] - fn model_flag_not_injected_when_equals_format_in_args() { - let args = vec!["--model=opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), 
&args), - None, - "model should not be injected when --model=value already in args" - ); - } - - #[test] - fn model_flag_injected_with_other_args() { - let args = vec!["--verbose".to_string()]; - assert_eq!( - compute_model_flag(Some("gpt-4o"), &args), - Some("gpt-4o".to_string()), - "model should be injected when other unrelated args exist" - ); - } -} diff --git a/crates/broker/src/pty_worker.rs b/crates/broker/src/pty_worker.rs index aed8cd378..20da8208d 100644 --- a/crates/broker/src/pty_worker.rs +++ b/crates/broker/src/pty_worker.rs @@ -1,12 +1,30 @@ -use super::*; +use std::{ + collections::{HashSet, VecDeque}, + time::{Duration, Instant}, +}; + +use anyhow::{Context, Result}; +use relay_broker::{ + protocol::{MessageInjectionMode, ProtocolEnvelope, RelayDelivery}, + pty::PtySession, +}; +use serde_json::{json, Value}; +use tokio::{ + io::{AsyncBufReadExt, BufReader}, + sync::mpsc, + time::MissedTickBehavior, +}; + +use crate::cli::PtyCommand; use crate::helpers::{ check_echo_in_output, current_timestamp_ms, delivery_injected_event_payload, delivery_queued_event_payload, floor_char_boundary, format_injection_for_worker_with_workspace, - parse_cli_command, parse_continuity_command, ActivityDetector, DeliveryOutcome, + parse_cli_command, parse_continuity_command, strip_ansi, ActivityDetector, DeliveryOutcome, PendingActivity, PendingVerification, ThrottleState, ACTIVITY_BUFFER_KEEP_BYTES, ACTIVITY_BUFFER_MAX_BYTES, ACTIVITY_WINDOW, VERIFICATION_WINDOW, }; use crate::readiness::{cli_prompt_ready, detect_cli_ready, GridReadinessSnapshot}; +use crate::runtime::{get_terminal_size, send_frame}; use crate::wrap::{PtyAutoState, AUTO_SUGGESTION_BLOCK_TIMEOUT}; use base64::Engine; use relay_broker::snapshot::Snapshot; diff --git a/crates/broker/src/routing.rs b/crates/broker/src/routing.rs index b161afdc9..3e9e67419 100644 --- a/crates/broker/src/routing.rs +++ b/crates/broker/src/routing.rs @@ -2,7 +2,7 @@ use std::collections::HashSet; use 
relay_broker::types::{InboundKind, InboundRelayEvent}; -use crate::normalize_channel; +use crate::runtime::normalize_channel; #[derive(Clone)] pub(crate) struct RoutingWorker<'a> { diff --git a/crates/broker/src/runtime.rs b/crates/broker/src/runtime.rs new file mode 100644 index 000000000..2b10759a0 --- /dev/null +++ b/crates/broker/src/runtime.rs @@ -0,0 +1,7512 @@ +use std::{ + collections::{HashMap, HashSet, VecDeque}, + path::{Path, PathBuf}, + process::Stdio, + sync::{Arc, OnceLock}, + time::{Duration, Instant}, +}; + +use crate::helpers::{ + agent_name_eq, floor_char_boundary, is_self_name, normalize_cli_name, parse_cli_command, +}; +use crate::listen_api::{ + broadcast_if_relevant, listen_api_router, DeliveryRouteError, ListenApiConfig, + ListenApiRequest, SetInboundDeliveryModeOk, +}; +use crate::routing::display_target_for_dashboard; + +use anyhow::{Context, Result}; +use relaycast::WsEvent; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use tokio::{ + io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}, + sync::{broadcast, mpsc, Notify, RwLock}, + time::{timeout, MissedTickBehavior}, +}; +use uuid::Uuid; + +use relay_broker::{ + auth::AuthClient, + dedup::DedupCache, + message_bridge::map_ws_event, + multi_workspace::{MultiWorkspaceSession, WorkspaceInboundMessage, WorkspaceMembershipSummary}, + protocol::{ + AgentRuntime, AgentSpec, HeadlessProvider as ProtocolHeadlessProvider, + MessageInjectionMode, ProtocolEnvelope, RelayDelivery, PROTOCOL_VERSION, + }, + relaycast_ws::{ + format_worker_preregistration_error, registration_retry_after_secs, + retry_agent_registration, RegRetryOutcome, RelaycastHttpClient, WsControl, + }, + replay_buffer::{ReplayBuffer, DEFAULT_REPLAY_CAPACITY}, + snippets::ensure_relaycast_mcp_config, + telemetry::{ActionSource, TelemetryClient, TelemetryEvent}, + types::{ + BrokerCommandEvent, InboundDeliveryDispatch, InboundDeliveryMode, InboundDeliveryState, + InboundKind, PendingRelayMessage, + }, 
+}; + +use crate::cli::{DumpPtyCommand, DumpPtyFormat, HeadlessCommand, InitCommand}; +use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; +use crate::{broker, listen_api, routing, worker_request}; + +const DEFAULT_DELIVERY_RETRY_MS: u64 = 1_000; +const MAX_DELIVERY_RETRIES: u32 = 10; +const DEFAULT_RELAYCAST_BASE_URL: &str = "https://api.relaycast.dev"; +use crate::helpers::resolve_dm_participants_cached; +const THREAD_HISTORY_LIMIT: usize = 1_000; +const DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS: u64 = 3_000; +const DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS: u64 = 20_000; +const DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS: u64 = 200; +static TRACING_GUARD: OnceLock = OnceLock::new(); + +pub(crate) fn startup_debug_enabled() -> bool { + std::env::var("AGENT_RELAY_STARTUP_DEBUG") + .map(|value| { + let trimmed = value.trim(); + !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false") + }) + .unwrap_or(false) +} + +pub(crate) fn log_startup_phase(enabled: bool, started_at: Instant, message: impl AsRef) { + if enabled { + eprintln!( + "[agent-relay][startup +{}ms] {}", + started_at.elapsed().as_millis(), + message.as_ref() + ); + } +} + +pub(crate) fn headless_provider_cli_name(provider: &ProtocolHeadlessProvider) -> &'static str { + match provider { + ProtocolHeadlessProvider::Claude => "claude", + ProtocolHeadlessProvider::Opencode => "opencode", + } +} + +pub(crate) fn headless_provider_command( + provider: &ProtocolHeadlessProvider, + task: &str, + extra_args: &[String], +) -> (String, Vec) { + match provider { + ProtocolHeadlessProvider::Claude => { + let mut args = vec![ + "-p".to_string(), + "--dangerously-skip-permissions".to_string(), + ]; + args.extend(extra_args.iter().cloned()); + args.push(task.to_string()); + ("claude".to_string(), args) + } + ProtocolHeadlessProvider::Opencode => { + let mut args = vec!["run".to_string()]; + args.extend(extra_args.iter().cloned()); + args.push(task.to_string()); + 
("opencode".to_string(), args) + } + } +} + +pub(crate) fn headless_provider_from_cli(value: &str) -> Option { + match value.trim().to_ascii_lowercase().as_str() { + "claude" => Some(ProtocolHeadlessProvider::Claude), + "opencode" => Some(ProtocolHeadlessProvider::Opencode), + _ => None, + } +} + +pub(crate) fn runtime_label(runtime: &AgentRuntime) -> &'static str { + match runtime { + AgentRuntime::Pty => "pty", + AgentRuntime::Headless => "headless", + } +} + +#[allow(clippy::too_many_arguments)] +pub(crate) fn build_http_api_spawn_spec( + name: String, + cli: String, + transport: Option, + model: Option, + args: Vec, + channels: Vec, + cwd: Option, + team: Option, + shadow_of: Option, + shadow_mode: Option, + restart_policy: Option, +) -> Result { + let runtime = match transport + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(|value| value.to_ascii_lowercase()) + { + None => AgentRuntime::Pty, + Some(value) if value == "pty" => AgentRuntime::Pty, + Some(value) if value == "headless" => AgentRuntime::Headless, + Some(other) => { + anyhow::bail!("unsupported transport '{other}' (expected 'pty' or 'headless')") + } + }; + let parsed_restart_policy = match restart_policy { + Some(v) => Some(serde_json::from_value(v).context("invalid restart_policy")?), + None => None, + }; + + let (provider, cli_command, model) = match runtime { + AgentRuntime::Pty => (None, Some(cli), model), + AgentRuntime::Headless => { + let provider = headless_provider_from_cli(&cli).with_context(|| { + format!( + "provider '{cli}' does not support headless transport (supported: claude, opencode)" + ) + })?; + (Some(provider), None, model) + } + }; + + Ok(AgentSpec { + name, + runtime, + provider, + cli: cli_command, + model, + cwd, + team, + shadow_of, + shadow_mode, + args, + channels, + restart_policy: parsed_restart_policy, + }) +} + +#[derive(Debug)] +pub(crate) struct RuntimePaths { + persist: bool, + state: PathBuf, + pending: PathBuf, + /// Held for process 
lifetime to prevent concurrent broker instances (persist mode only). + #[allow(dead_code)] + _lock: Option, +} + +/// Shared Relaycast connection state used by run_init and run_wrap. +#[derive(Clone)] +pub(crate) struct RelayWorkspace { + pub(crate) workspace_id: String, + pub(crate) workspace_alias: Option, + pub(crate) relay_workspace_key: String, + pub(crate) self_name: String, + pub(crate) self_agent_id: String, + pub(crate) self_names: HashSet, + pub(crate) self_agent_ids: HashSet, + pub(crate) http_client: RelaycastHttpClient, + pub(crate) ws_control_tx: mpsc::Sender, +} + +pub(crate) struct RelaySession { + pub(crate) http_base: String, + pub(crate) default_workspace_id: Option, + pub(crate) workspaces: Vec, + pub(crate) ws_inbound_rx: mpsc::Receiver, +} + +#[derive(Clone)] +pub(crate) struct RelayReadyState { + workspace_key: String, + memberships: Vec, + default_workspace_id: Option, +} + +pub(crate) async fn serve_startup_api_until_ready( + listener: tokio::net::TcpListener, + relay_ready: Arc, +) -> tokio::net::TcpListener { + loop { + tokio::select! 
{ + _ = relay_ready.notified() => { + return listener; + } + accepted = listener.accept() => { + match accepted { + Ok((stream, _addr)) => { + tokio::spawn(handle_startup_api_connection(stream)); + } + Err(error) => { + tracing::warn!(error = %error, "startup API accept failed"); + tokio::time::sleep(Duration::from_millis(50)).await; + } + } + } + } + } +} + +pub(crate) async fn handle_startup_api_connection(mut stream: tokio::net::TcpStream) { + let mut buffer = [0_u8; 1024]; + let read = match timeout(Duration::from_secs(5), stream.read(&mut buffer)).await { + Ok(Ok(read)) => read, + Ok(Err(error)) => { + tracing::debug!(error = %error, "failed reading startup API request"); + return; + } + Err(_) => return, + }; + + let request = String::from_utf8_lossy(&buffer[..read]); + let path = request + .lines() + .next() + .and_then(|line| line.split_whitespace().nth(1)) + .unwrap_or("/"); + let (status, content_type, body) = if path == "/health" { + ( + "200 OK", + "application/json", + listen_api::listen_api_health_payload(None, vec![]).to_string(), + ) + } else { + ( + "503 Service Unavailable", + "text/plain; charset=utf-8", + "Broker is starting, please retry".to_string(), + ) + }; + let response = format!( + "HTTP/1.1 {status}\r\ncontent-type: {content_type}\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}", + body.len() + ); + if let Err(error) = stream.write_all(response.as_bytes()).await { + tracing::debug!(error = %error, "failed writing startup API response"); + } +} + +/// Build the standard env-var array passed to every spawned child agent. 
+pub(crate) fn normalize_initial_task(task: Option) -> Option { + task.and_then(|value| { + if value.trim().is_empty() { + None + } else { + Some(value) + } + }) +} + +pub(crate) struct RelaySessionOptions<'a> { + pub(crate) paths: &'a RuntimePaths, + pub(crate) requested_name: &'a str, + pub(crate) channels: Vec, + pub(crate) strict_name: bool, + pub(crate) agent_type: Option<&'a str>, + /// Read .mcp.json for additional self-name identities + pub(crate) read_mcp_identity: bool, + /// Write relaycast server entry to .mcp.json + pub(crate) ensure_mcp_config: bool, + pub(crate) runtime_cwd: &'a Path, +} + +pub(crate) async fn connect_relay(opts: RelaySessionOptions<'_>) -> Result { + let startup_debug = startup_debug_enabled(); + let connect_started = Instant::now(); + let http_base = std::env::var("RELAYCAST_BASE_URL") + .ok() + .or_else(|| std::env::var("RELAY_BASE_URL").ok()) + .unwrap_or_else(|| DEFAULT_RELAYCAST_BASE_URL.to_string()); + let ws_base = std::env::var("RELAYCAST_WS_URL") + .unwrap_or_else(|_| derive_ws_base_url_from_http(&http_base)); + + log_startup_phase( + startup_debug, + connect_started, + format!( + "connect_relay begin requested_name='{}' channels={}", + opts.requested_name, + opts.channels.join(",") + ), + ); + let auth = AuthClient::new(http_base.clone()); + let sessions = auth + .startup_session_set_with_options( + Some(opts.requested_name), + opts.strict_name, + opts.agent_type, + ) + .await + .context("failed to initialize relaycast session")?; + log_startup_phase( + startup_debug, + connect_started, + format!( + "startup_session_set_with_options complete memberships={}", + sessions.memberships.len() + ), + ); + + let default_session = sessions + .default_session() + .or_else(|| sessions.memberships.first()) + .context("no relaycast memberships were initialized")?; + let relay_workspace_key = default_session.credentials.api_key.clone(); + let self_agent_id = default_session.credentials.agent_id.clone(); + let self_token = 
default_session.token.clone(); + let agent_name = default_session + .credentials + .agent_name + .clone() + .unwrap_or_else(|| opts.requested_name.to_string()); + + let identity_debug = format!( + "agent_name='{}' +requested='{}' +agent_id='{}' +token_prefix='{}' +default_workspace='{}' +workspace_count='{}' +timestamp='{}' +", + agent_name, + opts.requested_name, + self_agent_id, + &self_token[..self_token.len().min(16)], + default_session.credentials.workspace_id, + sessions.memberships.len(), + chrono::Utc::now().to_rfc3339() + ); + let debug_path = opts + .paths + .state + .parent() + .unwrap() + .join("identity-debug.txt"); + if std::env::var("AGENT_RELAY_NO_DEBUG_FILES").is_err() { + let _ = std::fs::write(&debug_path, &identity_debug); + eprintln!( + "[agent-relay] identity debug written to {}", + debug_path.display() + ); + } + if agent_name != opts.requested_name { + eprintln!( + "[agent-relay] WARNING: registered as '{}' (requested '{}')", + agent_name, opts.requested_name + ); + } + + if opts.ensure_mcp_config { + if let Err(error) = ensure_relaycast_mcp_config( + opts.runtime_cwd, + Some(relay_workspace_key.as_str()), + Some(http_base.as_str()), + None, + ) { + tracing::warn!("failed to ensure .mcp.json: {error}"); + } + } + + log_startup_phase( + startup_debug, + connect_started, + "MultiWorkspaceSession::new begin", + ); + let mut multi = MultiWorkspaceSession::new( + http_base.clone(), + ws_base, + auth, + sessions, + opts.channels, + opts.read_mcp_identity, + opts.runtime_cwd, + relay_broker::events::EventEmitter::new(false), + ); + log_startup_phase( + startup_debug, + connect_started, + format!( + "MultiWorkspaceSession::new complete handles={} default_workspace={:?}", + multi.handles.len(), + multi.default_workspace_id + ), + ); + + let default_workspace_id = multi.default_workspace_id.clone(); + let workspaces = multi + .handles + .drain(..) 
+ .map(|handle| RelayWorkspace { + workspace_id: handle.workspace_id, + workspace_alias: handle.workspace_alias, + relay_workspace_key: handle.relay_workspace_key, + self_name: handle.self_name, + self_agent_id: handle.self_agent_id, + self_names: handle.self_names, + self_agent_ids: handle.self_agent_ids, + http_client: handle.http_client, + ws_control_tx: handle.ws_control_tx, + }) + .collect(); + + Ok(RelaySession { + http_base, + default_workspace_id, + workspaces, + ws_inbound_rx: multi.inbound_rx, + }) +} + +#[derive(Debug, Clone)] +pub(crate) struct PendingDelivery { + worker_name: String, + delivery: RelayDelivery, + attempts: u32, + next_retry_at: Instant, +} + +/// Serializable snapshot of pending deliveries for crash recovery. +#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct PersistedPendingDelivery { + worker_name: String, + delivery: RelayDelivery, + attempts: u32, +} + +pub(crate) fn save_pending_deliveries( + path: &Path, + deliveries: &HashMap, +) -> Result<()> { + let persisted: Vec = deliveries + .values() + .map(|pd| PersistedPendingDelivery { + worker_name: pd.worker_name.clone(), + delivery: pd.delivery.clone(), + attempts: pd.attempts, + }) + .collect(); + let json = serde_json::to_string_pretty(&persisted)?; + let dir = path.parent().unwrap_or(path); + let mut tmp = tempfile::NamedTempFile::new_in(dir) + .with_context(|| format!("failed creating temp file in {}", dir.display()))?; + std::io::Write::write_all(&mut tmp, json.as_bytes())?; + tmp.persist(path) + .with_context(|| format!("failed persisting pending deliveries to {}", path.display()))?; + Ok(()) +} + +pub(crate) fn load_pending_deliveries(path: &Path) -> HashMap { + let data = match std::fs::read_to_string(path) { + Ok(d) => d, + Err(_) => return HashMap::new(), + }; + let persisted: Vec = match serde_json::from_str(&data) { + Ok(v) => v, + Err(_) => return HashMap::new(), + }; + persisted + .into_iter() + .map(|p| { + let id = p.delivery.delivery_id.clone(); + ( 
+ id, + PendingDelivery { + worker_name: p.worker_name, + delivery: p.delivery, + attempts: p.attempts, + next_retry_at: Instant::now(), // retry immediately on restart + }, + ) + }) + .collect() +} + +// These payload structs were used by the stdio protocol handler (handle_sdk_frame). +#[derive(Debug, Serialize)] +pub(crate) struct AgentMetrics { + name: String, + pid: u32, + memory_bytes: u64, + uptime_secs: u64, +} + +#[derive(Debug, Deserialize)] +pub(crate) struct DeliveryAckPayload { + delivery_id: String, + event_id: String, +} + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct ThreadInfo { + thread_id: String, + name: String, + unread_count: usize, + #[serde(skip_serializing_if = "Option::is_none")] + last_message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + last_message_at: Option, +} + +#[derive(Debug, Clone)] +pub(crate) struct ThreadAccumulator { + info: ThreadInfo, + sort_key: i64, +} + +pub(crate) fn normalize_sender(sender: Option) -> String { + let raw = sender + .unwrap_or_else(|| "human:orchestrator".to_string()) + .trim() + .to_string(); + if raw.is_empty() { + return "human:orchestrator".to_string(); + } + if let Some(rest) = raw.strip_prefix("human:") { + let normalized_rest = rest.trim(); + if normalized_rest.is_empty() { + return "human:orchestrator".to_string(); + } + return format!("human:{normalized_rest}"); + } + raw +} + +pub(crate) fn sender_is_dashboard_label(sender: &str, self_name: &str) -> bool { + let trimmed = sender.trim(); + trimmed.eq_ignore_ascii_case("Dashboard") + || trimmed.eq_ignore_ascii_case("human:Dashboard") + || trimmed.eq_ignore_ascii_case("human:orchestrator") + || trimmed.eq_ignore_ascii_case(self_name) +} + +/// Connection metadata discovered from a running broker — typically by +/// reading `/connection.json` or from explicit CLI flags / env. 
+pub(crate) struct BrokerConnection { + base_url: String, + api_key: Option, +} + +/// Resolve the broker connection by checking, in order: +/// +/// 1. Explicit CLI args (`--broker-url`, `--api-key`). When `--broker-url` +/// is supplied without an API key, we still attempt to fall back to the +/// API key from env / `.agent-relay/connection.json` so users don't have +/// to repeat `--api-key` for every dump-pty invocation. +/// 2. Env vars `RELAY_BROKER_URL` / `RELAY_BROKER_API_KEY`. +/// 3. `connection.json` in the supplied state dir, otherwise +/// `.agent-relay/connection.json` directly under the current working +/// directory. The bare `cwd` is intentionally NOT probed — an unrelated +/// `connection.json` sitting in the user's repo root must not silently +/// redirect the snapshot request (and its broker API key) elsewhere. +pub(crate) fn discover_broker_connection( + explicit_url: Option<&str>, + explicit_api_key: Option<&str>, + state_dir: Option<&Path>, +) -> Result { + // Walk the same search roots used for the URL fallback, but only to + // pull out a stored `api_key`. Lets `--broker-url` reuse the broker's + // saved key when the env var and `--api-key` are both unset. 
+ let api_key_from_connection_file = || -> Option { + let cwd = std::env::current_dir().ok()?; + let roots: Vec = match state_dir { + Some(dir) => vec![dir.to_path_buf()], + None => vec![cwd.join(".agent-relay")], + }; + for root in roots { + let path = root.join("connection.json"); + if !path.is_file() { + continue; + } + let body = std::fs::read_to_string(&path).ok()?; + let value: Value = serde_json::from_str(&body).ok()?; + if let Some(key) = value.get("api_key").and_then(Value::as_str) { + if !key.trim().is_empty() { + return Some(key.to_string()); + } + } + } + None + }; + + let resolve_api_key = |explicit: Option<&str>| -> Option { + explicit + .map(ToString::to_string) + .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) + .or_else(api_key_from_connection_file) + .filter(|value| !value.trim().is_empty()) + }; + + if let Some(url) = explicit_url { + return Ok(BrokerConnection { + base_url: url.trim_end_matches('/').to_string(), + api_key: resolve_api_key(explicit_api_key), + }); + } + + if let Ok(url) = std::env::var("RELAY_BROKER_URL") { + let trimmed = url.trim(); + if !trimmed.is_empty() { + return Ok(BrokerConnection { + base_url: trimmed.trim_end_matches('/').to_string(), + api_key: resolve_api_key(explicit_api_key), + }); + } + } + + let cwd = std::env::current_dir().context("failed to read current directory")?; + let search_roots: Vec = match state_dir { + Some(dir) => vec![dir.to_path_buf()], + None => vec![cwd.join(".agent-relay")], + }; + + for root in &search_roots { + let path = root.join("connection.json"); + if !path.is_file() { + continue; + } + let body = std::fs::read_to_string(&path) + .with_context(|| format!("failed reading {}", path.display()))?; + let value: Value = serde_json::from_str(&body) + .with_context(|| format!("failed parsing {}", path.display()))?; + let url = value + .get("url") + .and_then(Value::as_str) + .with_context(|| format!("connection file missing 'url': {}", path.display()))? 
+ .to_string(); + let api_key = explicit_api_key + .map(ToString::to_string) + .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) + .or_else(|| { + value + .get("api_key") + .and_then(Value::as_str) + .map(ToString::to_string) + }) + .filter(|value| !value.trim().is_empty()); + return Ok(BrokerConnection { + base_url: url.trim_end_matches('/').to_string(), + api_key, + }); + } + + anyhow::bail!( + "could not locate broker connection. Pass --broker-url, set RELAY_BROKER_URL, \ + or run from a directory containing .agent-relay/connection.json" + ); +} + +/// `agent-relay-broker dump-pty ` — capture and print a worker's +/// current visible screen by hitting the broker's snapshot route. +pub(crate) async fn run_dump_pty(cmd: DumpPtyCommand) -> Result<()> { + use base64::Engine; + + let connection = discover_broker_connection( + cmd.broker_url.as_deref(), + cmd.api_key.as_deref(), + cmd.state_dir.as_deref(), + )?; + + let url = format!( + "{}/api/spawned/{}/snapshot?format={}", + connection.base_url, + urlencoding::encode(&cmd.name), + cmd.format.as_wire_str(), + ); + + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .context("failed to build http client")?; + + let mut request = client.get(&url); + if let Some(key) = connection.api_key.as_deref() { + request = request.header("X-API-Key", key); + } + let response = request + .send() + .await + .with_context(|| format!("failed reaching broker at {url}"))?; + let status = response.status(); + let body_bytes = response + .bytes() + .await + .context("failed reading broker response body")?; + + if !status.is_success() { + let body_str = String::from_utf8_lossy(&body_bytes); + anyhow::bail!("broker returned {status}: {body_str}"); + } + + let body: Value = + serde_json::from_slice(&body_bytes).context("broker response was not valid JSON")?; + let screen = body + .get("screen") + .and_then(Value::as_str) + .context("broker response missing 'screen' field")?; + + match cmd.format 
{ + DumpPtyFormat::Plain => { + // The plain payload already includes the trailing newline per row. + // Print as-is so pipelines see a stable terminator. + use std::io::Write; + let mut stdout = std::io::stdout().lock(); + stdout + .write_all(screen.as_bytes()) + .context("failed writing snapshot to stdout")?; + stdout.flush().ok(); + } + DumpPtyFormat::Ansi => { + let bytes = base64::engine::general_purpose::STANDARD + .decode(screen) + .context("broker returned non-base64 ansi screen")?; + use std::io::Write; + let mut stdout = std::io::stdout().lock(); + stdout + .write_all(&bytes) + .context("failed writing snapshot to stdout")?; + stdout.flush().ok(); + } + } + + Ok(()) +} + +pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { + let broker_start = Instant::now(); + let startup_debug = startup_debug_enabled(); + let mut agent_spawn_count: u32 = 0; + telemetry.track(TelemetryEvent::BrokerStart); + + let runtime_cwd = std::env::current_dir()?; + let resolved_name = if cmd.name.trim().is_empty() { + runtime_cwd + .file_name() + .and_then(|name| name.to_str()) + .filter(|name| !name.is_empty()) + .unwrap_or("project") + .to_string() + } else { + cmd.name.trim().to_string() + }; + let custom_state_dir = cmd.state_dir.as_ref().map(PathBuf::from); + log_startup_phase( + startup_debug, + broker_start, + format!( + "run_init begin name='{}' cwd='{}' persist={} channels='{}'", + resolved_name, + runtime_cwd.display(), + cmd.persist, + cmd.channels + ), + ); + let paths = if cmd.persist || custom_state_dir.is_some() { + ensure_runtime_paths(&runtime_cwd, &resolved_name, custom_state_dir.as_deref())? + } else { + // Warn if a stale .agent-relay/ dir exists from a previous persist run. + // Agents can read files from it directly (logs, state) and get confused. 
+ let stale_dir = runtime_cwd.join(".agent-relay"); + if stale_dir.exists() { + eprintln!( + "[agent-relay] WARNING: stale .agent-relay/ directory found in {}", + runtime_cwd.display() + ); + eprintln!( + "[agent-relay] WARNING: remove it to avoid confusing spawned agents: rm -rf {}", + stale_dir.display() + ); + } + ensure_ephemeral_paths(&runtime_cwd, &resolved_name)? + }; + log_startup_phase( + startup_debug, + broker_start, + format!("runtime paths ready state='{}'", paths.state.display()), + ); + let mut state = if cmd.persist || custom_state_dir.is_some() { + broker::BrokerState::load(&paths.state).unwrap_or_default() + } else { + broker::BrokerState::default() + }; + + // Clean up agents from previous sessions whose processes have died + let reaped = state.reap_dead_agents(); + if !reaped.is_empty() { + tracing::info!( + agents = ?reaped, + "reaped {} dead agent(s) from previous session", + reaped.len() + ); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state after reaping dead agents"); + } + } + } + + if std::env::var("AGENT_RELAY_DISABLE_RELAYCAST").is_ok() { + anyhow::bail!( + "AGENT_RELAY_DISABLE_RELAYCAST is no longer supported; broker requires Relaycast" + ); + } + + // Use RELAY_AGENT_TYPE env var if set (e.g. "agent" for SDK-spawned brokers), + // otherwise default to "human" for interactive CLI usage. + let agent_type_env = std::env::var("RELAY_AGENT_TYPE").ok(); + let agent_type_ref = agent_type_env.as_deref().unwrap_or("human"); + + // HTTP/WS API — always started. This is the primary transport for SDK + // consumers, dashboards, and remote clients. When no explicit API key + // is configured, generate a random one so control endpoints are always + // authenticated (the key is written to the runtime metadata file for + // SDK discovery). 
+ let api_key = std::env::var("RELAY_BROKER_API_KEY") + .ok() + .filter(|v| !v.trim().is_empty()) + .unwrap_or_else(|| format!("br_{}", Uuid::new_v4().simple())); + + // Set the env var so listen_api's configured_broker_api_key() picks it up. + std::env::set_var("RELAY_BROKER_API_KEY", &api_key); + + let relay_ready = Arc::new(Notify::new()); + let relay_ready_state: Arc>> = Arc::new(RwLock::new(None)); + let (api_tx, mut api_rx) = mpsc::channel::(32); + let bind_addr = format!("{}:{}", cmd.api_bind, cmd.api_port); + log_startup_phase( + startup_debug, + broker_start, + format!("binding API listener on {}", bind_addr), + ); + let listener = tokio::net::TcpListener::bind(&bind_addr) + .await + .with_context(|| format!("failed to bind API on {}", bind_addr))?; + let actual_port = listener.local_addr()?.port(); + log_startup_phase( + startup_debug, + broker_start, + format!("API listener bound on {}:{}", cmd.api_bind, actual_port), + ); + // Machine-readable on stdout (SDK parses this to discover the port). + // Diagnostic logs stay on stderr via tracing/eprintln. + println!( + "[agent-relay] API listening on http://{}:{}", + cmd.api_bind, actual_port + ); + + // Write connection file so CLI commands can find this broker. 
+ let connection_dir = paths.state.parent().unwrap(); + let connection_path = connection_dir.join("connection.json"); + let connection = json!({ + "url": format!("http://{}:{}", cmd.api_bind, actual_port), + "port": actual_port, + "api_key": &api_key, + "pid": std::process::id(), + }); + if let Ok(json_str) = serde_json::to_string_pretty(&connection) { + if let Ok(mut tmp) = tempfile::NamedTempFile::new_in(connection_dir) { + use std::io::Write; + if tmp.write_all(json_str.as_bytes()).is_ok() { + let _ = tmp.persist(&connection_path); + tracing::info!(path = %connection_path.display(), "wrote connection file"); + } + } + } + + let (startup_listener_tx, startup_listener_rx) = + tokio::sync::oneshot::channel::(); + let relay_ready_for_startup = relay_ready.clone(); + tokio::spawn(async move { + let listener = serve_startup_api_until_ready(listener, relay_ready_for_startup).await; + let _ = startup_listener_tx.send(listener); + }); + + log_startup_phase(startup_debug, broker_start, "calling connect_relay"); + let relay = connect_relay(RelaySessionOptions { + paths: &paths, + requested_name: &resolved_name, + channels: channels_from_csv(&cmd.channels), + // Ephemeral brokers are short-lived and frequently restarted by tests/SDK + // callers. Use non-strict registration so stale Relaycast identities from + // prior runs don't hard-fail startup. 
+ strict_name: cmd.persist, + agent_type: Some(agent_type_ref), + read_mcp_identity: true, + ensure_mcp_config: cmd.persist, + runtime_cwd: &runtime_cwd, + }) + .await?; + log_startup_phase(startup_debug, broker_start, "connect_relay completed"); + + let RelaySession { + http_base, + default_workspace_id, + workspaces, + mut ws_inbound_rx, + } = relay; + let workspace_lookup: HashMap = workspaces + .iter() + .cloned() + .map(|workspace| (workspace.workspace_id.clone(), workspace)) + .collect(); + let default_workspace = if let Some(default_workspace_id) = default_workspace_id.as_deref() { + workspaces + .iter() + .find(|workspace| workspace.workspace_id == default_workspace_id) + .or_else(|| workspaces.first()) + } else { + workspaces.first() + } + .cloned() + .context("no relay workspace was available after initialization")?; + let relay_workspace_key = default_workspace.relay_workspace_key.clone(); + let self_names = default_workspace.self_names.clone(); + let ws_control_tx = default_workspace.ws_control_tx.clone(); + let relaycast_http = default_workspace.http_client.clone(); + let workspace_memberships: Vec = workspaces + .iter() + .map(|workspace| WorkspaceMembershipSummary { + workspace_id: workspace.workspace_id.clone(), + workspace_alias: workspace.workspace_alias.clone(), + is_default: default_workspace_id + .as_deref() + .is_some_and(|workspace_id| workspace_id == workspace.workspace_id), + }) + .collect(); + let relay_workspaces_json = serde_json::to_string( + &workspaces + .iter() + .map(|workspace| { + serde_json::json!({ + "workspace_id": workspace.workspace_id, + "workspace_alias": workspace.workspace_alias, + "api_key": workspace.relay_workspace_key, + }) + }) + .collect::>(), + )?; + + // Broadcast channel for streaming dashboard-relevant events to WS clients. + // Created before publishing the ready router so replay and WS endpoints are + // available as soon as Relaycast workspace data is known. 
+ let (events_tx, _events_rx) = broadcast::channel::(512); + let replay_buffer = ReplayBuffer::new(DEFAULT_REPLAY_CAPACITY); + + let ready_router = listen_api_router(ListenApiConfig { + tx: api_tx.clone(), + events_tx: events_tx.clone(), + replay_buffer: replay_buffer.clone(), + workspace_key: Some(relay_workspace_key.clone()), + memberships: workspace_memberships.clone(), + default_workspace_id: default_workspace_id.clone(), + persist: cmd.persist, + }); + { + let mut ready = relay_ready_state.write().await; + *ready = Some(RelayReadyState { + workspace_key: relay_workspace_key.clone(), + memberships: workspace_memberships.clone(), + default_workspace_id: default_workspace_id.clone(), + }); + } + if let Some(ready) = relay_ready_state.read().await.as_ref() { + log_startup_phase( + startup_debug, + broker_start, + format!( + "relay ready workspace_key_set={} memberships={} default_workspace={:?}", + !ready.workspace_key.is_empty(), + ready.memberships.len(), + ready.default_workspace_id + ), + ); + } + relay_ready.notify_one(); + let listener = startup_listener_rx + .await + .context("startup API listener task stopped before Relaycast readiness handoff")?; + tokio::spawn(async move { + if let Err(e) = axum::serve(listener, ready_router).await { + tracing::error!(error = %e, "HTTP API server error"); + } + }); + + log_startup_phase( + startup_debug, + broker_start, + format!( + "ensuring default channels for {} workspaces", + workspaces.len() + ), + ); + for workspace in &workspaces { + if let Err(error) = workspace.http_client.ensure_default_channels().await { + tracing::warn!(workspace_id = %workspace.workspace_id, error = %error, "failed to ensure default channels"); + } + } + log_startup_phase(startup_debug, broker_start, "default channels ensured"); + + let extra_channels = channels_from_csv(&cmd.channels); + log_startup_phase( + startup_debug, + broker_start, + format!("ensuring extra channels count={}", extra_channels.len()), + ); + for workspace in 
&workspaces { + if let Err(error) = workspace + .http_client + .ensure_extra_channels(&extra_channels) + .await + { + tracing::warn!(workspace_id = %workspace.workspace_id, error = %error, "failed to ensure extra channels"); + } + } + log_startup_phase(startup_debug, broker_start, "extra channels ensured"); + + if !extra_channels.is_empty() { + log_startup_phase( + startup_debug, + broker_start, + "subscribing websocket control channels", + ); + for workspace in &workspaces { + let _ = workspace + .ws_control_tx + .send(WsControl::Subscribe(extra_channels.clone())) + .await; + } + log_startup_phase( + startup_debug, + broker_start, + "websocket subscriptions updated", + ); + } + + let mut worker_env = vec![ + ("RELAY_BASE_URL".to_string(), http_base.clone()), + ("RELAY_API_KEY".to_string(), relay_workspace_key.clone()), + ( + "RELAY_WORKSPACES_JSON".to_string(), + relay_workspaces_json.clone(), + ), + ]; + if let Some(default_workspace_id) = default_workspace_id.clone() { + // Do NOT stamp RELAYFILE_WORKSPACE from default_workspace_id. The + // relaycast workspace id and the relayfile workspace id are + // independent — a relayfile JWT scoped to a different workspace will + // 403 with "workspace mismatch" when the relayfile MCP sends the + // wrong id. Callers that share an id across both services (e.g. the + // canonical `relay on start` flow) set RELAYFILE_WORKSPACE + // themselves through per-spawn env_vars. 
+ worker_env.push(( + "RELAY_DEFAULT_WORKSPACE".to_string(), + default_workspace_id.clone(), + )); + worker_env.push(("RELAY_WORKSPACE_ID".to_string(), default_workspace_id)); + } + + let (sdk_out_tx, mut sdk_out_rx) = mpsc::channel::>(1024); + let events_tx_for_stdout = events_tx.clone(); + let replay_buffer_for_stdout = replay_buffer.clone(); + tokio::spawn(async move { + while let Some(frame) = sdk_out_rx.recv().await { + // Broadcast events to WS clients (the primary SDK transport) + if frame.msg_type == "event" { + broadcast_if_relevant( + &events_tx_for_stdout, + &replay_buffer_for_stdout, + &frame.payload, + ) + .await; + } + // Note: stdout writing is removed. The HTTP/WS API is the + // only SDK transport. Events flow through broadcast_if_relevant + // → events_tx → WS clients. + } + }); + + let (worker_event_tx, mut worker_event_rx) = mpsc::channel::(1024); + let worker_logs_dir = paths + .state + .parent() + .expect("state path should always have a parent") + .join("team") + .join("worker-logs"); + let mut workers = + WorkerRegistry::new(worker_event_tx, worker_env, worker_logs_dir, broker_start); + + // Load crash insights from previous session + let crash_insights_path = paths.state.parent().unwrap().join("crash-insights.json"); + let mut crash_insights = + relay_broker::crash_insights::CrashInsights::load(&crash_insights_path); + + let mut sdk_lines = BufReader::new(tokio::io::stdin()).lines(); + let mut stdin_open = true; + let mut reap_tick = tokio::time::interval(Duration::from_millis(500)); + reap_tick.set_missed_tick_behavior(MissedTickBehavior::Skip); + let mut dedup = DedupCache::new(Duration::from_secs(300), 8192); + let delivery_retry_interval = delivery_retry_interval(); + let mut pending_deliveries = load_pending_deliveries(&paths.pending); + let mut terminal_failed_deliveries: HashSet = HashSet::new(); + // Outstanding worker-bound RPC requests waiting on a `*_response` + // frame from the wrapped worker. 
Keyed by the `request_id` we put on + // the outbound request frame; the reply `oneshot` is consumed when + // the worker echoes the same `request_id` back, or the entry expires + // via the deadline sweep in the `reap_tick` arm below. + // + // The generic correlation infrastructure lives in `crate::worker_request` + // so each new request/response route (`snapshot_pty`, `delivery-mode`, + // `pending`, `flush`, ...) costs about five lines of broker plumbing. + let mut pending_requests: HashMap = HashMap::new(); + // Per-worker inbound-delivery-mode + pending-relay-message queue. Lives + // parallel to `workers.workers` so we can swap modes / inspect / + // drain without touching `WorkerHandle` (which holds OS-level + // process state). See `relay_broker::types::InboundDeliveryState`. Entries + // are created lazily on first lookup and removed wherever workers + // exit (`Release` arm, `worker_exited` frame, `reap_exited` sweep). + let mut delivery_states: HashMap = HashMap::new(); + let mut dm_participants_cache: HashMap)> = HashMap::new(); + let mut recent_thread_messages: VecDeque = VecDeque::new(); + if !pending_deliveries.is_empty() { + tracing::info!( + count = pending_deliveries.len(), + "loaded {} pending deliveries from previous session", + pending_deliveries.len() + ); + } + + let mut shutdown = false; + + // Owner lease: in ephemeral mode, the broker shuts down if the SDK + // doesn't renew the lease within this duration. Replaces stdin EOF + // detection. Disabled in persist mode. + let lease_duration = if cmd.persist { + None + } else { + Some(Duration::from_secs(120)) + }; + let mut last_lease_renewal = Instant::now(); + let mut lease_check = tokio::time::interval(Duration::from_secs(10)); + lease_check.set_missed_tick_behavior(MissedTickBehavior::Skip); + + // Graceful-shutdown signal: SIGTERM on unix, Ctrl+Break/Close on Windows. + // `tokio::signal::ctrl_c()` is handled in its own select! arm below and + // works on both platforms. 
+ #[cfg(unix)] + let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?; + #[cfg(windows)] + let mut sigterm = tokio::signal::windows::ctrl_shutdown()?; + + while !shutdown { + tokio::select! { + _ = tokio::signal::ctrl_c() => { + shutdown = true; + } + + _ = lease_check.tick() => { + if let Some(duration) = lease_duration { + if last_lease_renewal.elapsed() > duration { + tracing::info!( + elapsed_secs = last_lease_renewal.elapsed().as_secs(), + lease_secs = duration.as_secs(), + "owner lease expired — shutting down" + ); + shutdown = true; + } + } + } + + _ = sigterm.recv() => { + tracing::info!("received SIGTERM, shutting down"); + shutdown = true; + } + + // HTTP API requests (when --api-port is active) + result = api_rx.recv() => { + if let Some(req) = result { + match req { + ListenApiRequest::Spawn { + name, + cli, + transport, + model, + args, + task, + channels, + cwd, + team, + shadow_of, + shadow_mode, + continue_from, + idle_threshold_secs, + skip_relay_prompt, + restart_policy, + agent_token, + reply, + } => { + let effective_channels = if channels.is_empty() { + default_spawn_channels() + } else { + channels.clone() + }; + let spec = match build_http_api_spawn_spec( + name.clone(), + cli.clone(), + transport, + model.clone(), + args, + effective_channels.clone(), + cwd, + team, + shadow_of, + shadow_mode, + *restart_policy, + ) { + Ok(spec) => spec, + Err(error) => { + let _ = reply.send(Err(error.to_string())); + continue; + } + }; + let mut preregistration_warning: Option = None; + let registration_result = retry_agent_registration( + &relaycast_http, &name, Some(&cli), + ).await; + let worker_relay_key = match registration_result { + Ok(token) => Some(token), + Err(RegRetryOutcome::RetryableExhausted(error)) => { + let message = format_worker_preregistration_error(&name, &error); + tracing::warn!( + worker = %name, + error = %error, + "continuing spawn without pre-registration after retries exhausted" + ); + 
preregistration_warning = Some(message); + None + } + Err(RegRetryOutcome::Fatal(error)) => { + let _ = reply.send(Err(format_worker_preregistration_error(&name, &error))); + continue; + } + }; + + // Caller-supplied agent_token overrides auto-registration + let worker_relay_key = agent_token.or(worker_relay_key); + + let mut effective_task = normalize_initial_task(task); + if let Some(ref continue_from) = continue_from { + let continuity_dir = continuity_dir(&paths.state); + let continuity_file = continuity_dir.join(format!("{}.json", continue_from)); + if continuity_file.exists() { + match std::fs::read_to_string(&continuity_file) { + Ok(contents) => { + if let Ok(ctx) = serde_json::from_str::(&contents) { + let prev_task = ctx + .get("initial_task") + .and_then(Value::as_str) + .unwrap_or("unknown"); + let summary = ctx + .get("summary") + .and_then(Value::as_str) + .unwrap_or("no summary available"); + let messages = ctx + .get("message_history") + .and_then(Value::as_array) + .map(|msgs| { + msgs.iter() + .filter_map(|m| { + let from = m + .get("from") + .and_then(Value::as_str) + .unwrap_or("?"); + let text = m + .get("text") + .and_then(Value::as_str) + .unwrap_or(""); + if text.is_empty() { + None + } else { + Some(format!(" {}: {}", from, text)) + } + }) + .collect::>() + .join("\n") + }) + .unwrap_or_default(); + + let continuity_block = format!( + "## Continuity Context (from previous session as '{}')\n\ + Previous task: {}\n\ + Session summary: {}\n{}", + continue_from, + prev_task, + summary, + if messages.is_empty() { + String::new() + } else { + format!("Recent messages:\n{}\n", messages) + } + ); + + effective_task = Some(match effective_task { + Some(new_task) => { + format!( + "{}\n\n## Current Task\n{}", + continuity_block, new_task + ) + } + None => continuity_block, + }); + tracing::info!( + agent = %name, + continue_from = %continue_from, + "injected continuity context from previous session for HTTP API spawn" + ); + } + } + Err(e) => { + 
tracing::warn!( + agent = %name, + continue_from = %continue_from, + error = %e, + "failed to read continuity file for HTTP API spawn" + ); + } + } + } else { + tracing::warn!( + agent = %name, + continue_from = %continue_from, + "no continuity file found at {}", + continuity_file.display() + ); + } + } + + match workers.spawn( + spec, + Some("Dashboard".to_string()), + None, + worker_relay_key.clone(), + skip_relay_prompt, + idle_threshold_secs.map(|s| s.to_string()), + ).await { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers.initial_tasks.insert(name.clone(), task_text.clone()); + } + agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::HumanDashboard, + has_task: effective_task.is_some(), + is_shadow: effective_spec.shadow_of.is_some() + || effective_spec.shadow_mode.is_some(), + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent { + runtime: effective_spec.runtime.clone(), + parent: Some("Dashboard".to_string()), + channels: effective_spec.channels.clone(), + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + + }, + ); + if paths.persist { let _ = state.save(&paths.state); } + note_local_spawn_control_dedup( + &mut dedup, + default_workspace_id + .as_deref() + .or_else(|| workspaces.first().map(|workspace| workspace.workspace_id.as_str())), + &name, + worker_relay_key.as_deref(), + ); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"agent_spawned", + "name":&name, + "runtime":runtime_label(&effective_spec.runtime), + "provider": effective_spec.provider.clone(), + "cli": effective_spec.cli.clone(), + "model": 
effective_spec.model.clone(), + "pid":pid, + "source":"http_api", + "pre_registered": worker_relay_key.is_some(), + "registration_warning": preregistration_warning.clone(), + }), + ).await; + publish_agent_state_transition( + &ws_control_tx, + &name, + "spawned", + Some("http_api_spawn"), + ) + .await; + let _ = reply.send(Ok(json!({ + "success": true, + "name": name, + "runtime": runtime_label(&effective_spec.runtime), + "model": effective_spec.model.clone(), + "pid": pid, + "pre_registered": worker_relay_key.is_some(), + "warning": preregistration_warning, + }))); + } + Err(e) => { + eprintln!("[agent-relay] HTTP API: failed to spawn '{}': {}", name, e); + let _ = reply.send(Err(e.to_string())); + } + } + } + ListenApiRequest::SetModel { name, model, timeout_ms, reply } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + continue; + }; + + let model_command = format!("/model {}\n", model); + let result = async { + handle + .stdin + .write_all(model_command.as_bytes()) + .await + .with_context(|| { + format!("failed writing model command to worker '{}'", name) + })?; + handle + .stdin + .flush() + .await + .with_context(|| { + format!("failed flushing worker '{}' stdin", name) + })?; + if let Some(timeout_ms) = timeout_ms { + tracing::info!( + name = %name, + timeout_ms, + "HTTP API set_model timeout_ms is currently advisory only" + ); + } + Ok::<(), anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + let _ = reply.send(Ok(json!({ + "name": name, + "model": model, + "success": true, + }))); + } + Err(error) => { + let _ = reply.send(Err(error.to_string())); + } + } + } + ListenApiRequest::Release { name, reason, reply } => { + if let Some(ref r) = reason { + tracing::info!(worker = %name, reason = %r, "releasing agent via HTTP API"); + } + // Unregister from supervisor before release to prevent + // auto-restart of intentionally released agents. 
+ workers.supervisor.unregister(&name); + workers.metrics.on_release(&name); + match workers.release(&name).await { + Ok(()) => { + if let Err(error) = relaycast_http.mark_agent_offline(&name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark released worker offline in relaycast" + ); + } + let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + &sdk_out_tx, + json!({"kind":"delivery_dropped","name":&name,"count":dropped,"reason":"agent_released"}), + ).await; + } + fail_pending_requests_for_worker(&mut pending_requests, &name, "agent_released"); + delivery_states.remove(&name); + state.agents.remove(&name); + if paths.persist { let _ = state.save(&paths.state); } + let _ = send_event( + &sdk_out_tx, + json!({"kind":"agent_released","name":&name}), + ).await; + publish_agent_state_transition( + &ws_control_tx, + &name, + "exited", + Some("http_api_release"), + ) + .await; + let _ = reply.send(Ok(json!({ "success": true, "name": name }))); + } + Err(e) => { + let message = e.to_string(); + if is_unknown_worker_error_message(&message) { + relaycast_http.forget_agent_registration(&name); + state.agents.remove(&name); + if paths.persist { + let _ = state.save(&paths.state); + } + tracing::debug!( + worker = %name, + "ignoring duplicate HTTP API release for already exited worker" + ); + let _ = reply.send(Ok(json!({ "success": true, "name": name }))); + } else { + eprintln!("[agent-relay] HTTP API: failed to release '{}': {}", name, e); + let _ = reply.send(Err(message)); + } + } + } + } + ListenApiRequest::Send { + to, + text, + from, + thread_id, + workspace_id, + workspace_alias, + mode, + reply, + } => { + let normalized_to = to.trim().to_string(); + let selected_workspace = if let Some(workspace_id) = workspace_id.as_deref() { + workspace_lookup + .get(workspace_id) + .cloned() + .ok_or_else(|| format!("workspace_not_found:workspace '{}' is not attached", workspace_id)) + } else 
if let Some(workspace_alias) = workspace_alias.as_deref() { + workspaces + .iter() + .find(|workspace| { + workspace + .workspace_alias + .as_deref() + .is_some_and(|alias| alias.eq_ignore_ascii_case(workspace_alias)) + }) + .cloned() + .ok_or_else(|| format!("workspace_not_found:workspace alias '{}' is not attached", workspace_alias)) + } else if workspaces.len() == 1 { + Ok(workspaces[0].clone()) + } else if let Some(default_workspace_id) = default_workspace_id.as_deref() { + workspace_lookup + .get(default_workspace_id) + .cloned() + .ok_or_else(|| format!("workspace_not_found: default workspace '{}' not found", default_workspace_id)) + } else { + Err("ambiguous_workspace:workspaceId or workspaceAlias is required when multiple workspaces are attached".to_string()) + }; + let selected_workspace = match selected_workspace { + Ok(workspace) => workspace, + Err(error) => { + let _ = reply.send(Err(error)); + continue; + } + }; + let selected_workspace_id = selected_workspace.workspace_id.clone(); + let selected_workspace_alias = selected_workspace.workspace_alias.clone(); + let workspace_self_name = selected_workspace.self_name.clone(); + let normalized_sender = normalize_sender(from.clone()); + let from_dashboard = + sender_is_dashboard_label(&normalized_sender, &workspace_self_name); + let delivery_from = if from_dashboard { + workspace_self_name.clone() + } else { + normalized_sender.clone() + }; + tracing::info!( + target = "relay_broker::http_api", + + raw_from = ?from, + normalized_sender = %normalized_sender, + from_dashboard = %from_dashboard, + delivery_from = %delivery_from, + to = %normalized_to, + thread_id = ?thread_id, + self_name = %workspace_self_name, + "HTTP API send request" + ); + let ui_from = if from_dashboard { + workspace_self_name.clone() + } else { + normalized_sender + }; + let event_id = format!("http_{}", Uuid::new_v4().simple()); + let priority = if normalized_to.starts_with('#') { 3 } else { 2 }; + let mut delivered = 0usize; + let mut 
delivery_errors = 0usize; + let request_start = Instant::now(); + let local_delivery_timeout = http_api_local_delivery_timeout(); + let relaycast_timeout = http_api_relaycast_send_timeout(); + let event_emit_timeout = http_api_event_emit_timeout(); + + record_thread_history_event( + &mut recent_thread_messages, + json!({ + "event_id": event_id.clone(), + "from": ui_from.clone(), + "target": normalized_to.clone(), + "to": normalized_to.clone(), + "text": text.clone(), + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + "timestamp": chrono::Utc::now().to_rfc3339(), + }), + ); + + let targets = if normalized_to.starts_with('#') { + workers.worker_names_for_channel_delivery(&normalized_to, &delivery_from, Some(&selected_workspace_id)) + } else { + workers.worker_names_for_direct_target(&normalized_to, &delivery_from, Some(&selected_workspace_id)) + }; + + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + delivery_from = %delivery_from, + target_count = %targets.len(), + "resolved HTTP API send targets" + ); + + for worker_name in targets { + // Inbound-delivery queue: every inbound message + // enters the per-worker FIFO first. `auto_inject` + // drains immediately; `manual_flush` holds and + // counts as delivered so the HTTP caller's ack + // semantics are unchanged. We pass the FULL + // routing context so any drain reproduces the + // original delivery (channel/thread/workspace + // /priority/mode), not a stripped-down DM. 
+ match queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + &worker_name, + InboundContext { + from: &delivery_from, + body: &text, + target: &normalized_to, + thread_id: thread_id.as_deref(), + workspace_id: Some(selected_workspace_id.as_str()), + workspace_alias: selected_workspace_alias.as_deref(), + priority, + mode: mode.clone(), + event_id: Some(&event_id), + }, + ) { + InboundQueueOutcome::Queued => { + delivered = delivered.saturating_add(1); + tracing::info!( + target = "relay_broker::http_api", + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + "queued local delivery (manual_flush inbound delivery mode)" + ); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"delivery_queued", + "name":&worker_name, + "event_id":&event_id, + "from":&delivery_from, + "target":&normalized_to, + "reason":"inbound_delivery_manual_flush", + }), + ).await; + continue; + } + InboundQueueOutcome::DrainNow(to_drain) => { + for queued in to_drain { + let queued_event_id = + queued.event_id.as_deref().unwrap_or(""); + let is_current = + queued.event_id.as_deref() == Some(event_id.as_str()); + match timeout( + local_delivery_timeout, + try_inject_pending_relay_message( + &mut workers, + &mut pending_deliveries, + &worker_name, + &queued, + delivery_retry_interval, + ), + ) + .await + { + Ok(Ok(_)) => { + if is_current { + delivered = delivered.saturating_add(1); + } + } + Ok(Err(error)) => { + if is_current { + delivery_errors = + delivery_errors.saturating_add(1); + } + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %queued_event_id, + to = %queued.target, + worker = %worker_name, + error = %error, + "local delivery attempt failed" + ); + } + Err(_) => { + if is_current { + delivery_errors = + delivery_errors.saturating_add(1); + } + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %queued_event_id, + to = %queued.target, + worker = %worker_name, + timeout_ms = 
%local_delivery_timeout.as_millis(), + "local delivery attempt timed out" + ); + } + } + } + continue; + } + InboundQueueOutcome::WorkerMissing => { + // Fall through so the standard + // not-found accounting path runs. + } + } + match timeout( + local_delivery_timeout, + queue_and_try_delivery_raw( + &mut workers, + &mut pending_deliveries, + &worker_name, + &event_id, + &delivery_from, + &normalized_to, + &text, + thread_id.clone(), + Some(selected_workspace_id.clone()), + selected_workspace_alias.clone(), + priority, + mode.clone(), + delivery_retry_interval, + ), + ) + .await + { + Ok(Ok(_)) => { + delivered = delivered.saturating_add(1); + } + Ok(Err(error)) => { + delivery_errors = delivery_errors.saturating_add(1); + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + error = %error, + "local delivery attempt failed" + ); + } + Err(_) => { + delivery_errors = delivery_errors.saturating_add(1); + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + timeout_ms = %local_delivery_timeout.as_millis(), + "local delivery attempt timed out" + ); + } + } + } + + if delivered > 0 { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + delivery_from = %delivery_from, + ui_from = %ui_from, + delivered = %delivered, + "local delivery succeeded" + ); + emit_http_api_event_with_timeout( + &sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": event_id, + "from": ui_from, + "target": normalized_to, + "body": text, + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + }), + event_emit_timeout, + ) + .await; + if reply + .send(Ok(json!({ + "success": true, + "event_id": event_id, + "delivered": delivered, + "local": true, + "workspace_id": selected_workspace_id, + 
"workspace_alias": selected_workspace_alias, + }))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before local delivery response" + ); + } + } else { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + mode = ?mode, + delivery_errors = %delivery_errors, + delivery_from = %delivery_from, + ui_from = %ui_from, + relaycast_timeout_ms = %relaycast_timeout.as_millis(), + "no local deliveries succeeded; forwarding to relaycast" + ); + let relaycast_start = Instant::now(); + match timeout( + relaycast_timeout, + selected_workspace + .http_client + .send_with_mode(&normalized_to, &text, mode.clone()), + ) + .await + { + Ok(Ok(())) => { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_ms = %relaycast_start.elapsed().as_millis(), + "relaycast publish succeeded" + ); + emit_http_api_event_with_timeout( + &sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": event_id, + "from": ui_from, + "target": normalized_to, + "body": text, + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + }), + event_emit_timeout, + ) + .await; + if reply + .send(Ok(json!({ + "success": true, + "event_id": event_id, + "relaycast_published": true, + "local": false, + "workspace_id": selected_workspace_id, + "workspace_alias": selected_workspace_alias, + }))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast response" + ); + } + } + Ok(Err(error)) => { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_ms = %relaycast_start.elapsed().as_millis(), + error = %error, + "relaycast publish failed" + ); + let not_found = format!("Agent 
\"{}\" not found", normalized_to); + if reply + .send(Err(format!( + "{not_found} and Relaycast publish failed: {error}" + ))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast failure response" + ); + } + } + Err(_) => { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_timeout_ms = %relaycast_timeout.as_millis(), + relaycast_ms = %relaycast_start.elapsed().as_millis(), + "relaycast publish timed out" + ); + let not_found = format!("Agent \"{}\" not found", normalized_to); + if reply + .send(Err(format!( + "{not_found} and Relaycast publish timed out after {}ms", + relaycast_timeout.as_millis() + ))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast timeout response" + ); + } + } + } + } + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + total_ms = %request_start.elapsed().as_millis(), + "HTTP API send request handling complete" + ); + } + ListenApiRequest::List { reply } => { + let _ = reply.send(Ok(json!({ "agents": workers.list() }))); + } + ListenApiRequest::Threads { reply } => { + let mut messages: Vec = + recent_thread_messages.iter().cloned().collect(); + match relaycast_http.get_all_dms(200).await { + Ok(dm_messages) => messages.extend(dm_messages), + Err(error) => { + tracing::debug!( + error = %error, + "failed to fetch relaycast dm history for /api/threads" + ); + } + } + let threads = build_thread_infos(&messages, &self_names); + let _ = reply.send(Ok(json!({ "threads": threads }))); + } + ListenApiRequest::SendInput { name, data, reply } => { + if let Err(err) = workers.send_to_worker( + &name, "write_pty", Some(format!("api_{}", Uuid::new_v4().simple())), + json!({ "data": data }), + ).await { + let _ = 
reply.send(Err(format!("agent_not_found: {}", err))); + } else { + let _ = reply.send(Ok(json!({ + "name": name, + "bytes_written": data.len(), + }))); + } + } + ListenApiRequest::ResizePty { name, rows, cols, reply } => { + if rows == 0 || cols == 0 { + let _ = reply.send(Err("invalid_dimensions: rows and cols must be >= 1".into())); + } else if let Err(err) = workers.send_to_worker( + &name, "resize_pty", Some(format!("api_{}", Uuid::new_v4().simple())), + json!({ "rows": rows, "cols": cols }), + ).await { + let _ = reply.send(Err(format!("agent_not_found: {}", err))); + } else { + let _ = reply.send(Ok(json!({ + "name": name, + "rows": rows, + "cols": cols, + }))); + } + } + ListenApiRequest::WorkerRequest { name, kind, payload, timeout, reply } => { + // Generic worker request/response: validate the + // worker exists and supports a PTY (all current + // request/response routes target the PTY side), + // then ship the frame and park the `reply` + // oneshot in `pending_requests`. The response is + // fulfilled either by the `*_response` arm below + // or by the deadline sweep in `reap_tick`. + // + // Headless workers don't run a VT and don't handle + // PTY-oriented RPCs — short-circuit with a typed + // error rather than letting the request sit until + // the timeout sweep returns a misleading + // `worker_timeout`. 
+ let runtime = workers + .workers + .get(&name) + .map(|handle| handle.spec.runtime.clone()); + match runtime { + None => { + let _ = reply.send(Err( + worker_request::RequestWorkerError::WorkerNotFound( + format!("no worker named '{name}'"), + ), + )); + } + Some(AgentRuntime::Headless) => { + let _ = reply.send(Err( + worker_request::RequestWorkerError::UnsupportedRuntime( + format!("worker '{name}' is headless; {kind} is only supported on PTY workers"), + ), + )); + } + Some(AgentRuntime::Pty) => { + let request_id = format!("req_{}", Uuid::new_v4().simple()); + if let Err(err) = workers.send_to_worker( + &name, + &kind, + Some(request_id.clone()), + payload, + ).await { + let _ = reply.send(Err( + worker_request::RequestWorkerError::SendFailed( + err.to_string(), + ), + )); + } else { + pending_requests.insert( + request_id, + worker_request::PendingRequest { + kind, + worker_name: name, + reply, + deadline: Instant::now() + timeout, + }, + ); + } + } + } + } + ListenApiRequest::GetMetrics { agent, reply } => { + if let Some(ref agent_name) = agent { + if let Some(handle) = workers.workers.get(agent_name) { + let m = build_agent_metrics(handle); + let _ = reply.send(Ok(json!({ "agents": [m], "broker": workers.metrics.snapshot(workers.workers.len()) }))); + } else { + let _ = reply.send(Err(format!("unknown worker '{}'", agent_name))); + } + } else { + let mut agent_metrics: Vec = workers.workers.values() + .map(build_agent_metrics) + .collect(); + agent_metrics.sort_by(|a, b| a.name.cmp(&b.name)); + let _ = reply.send(Ok(json!({ + "agents": agent_metrics, + "broker": workers.metrics.snapshot(workers.workers.len()), + }))); + } + } + ListenApiRequest::GetStatus { reply } => { + let pending: Vec = pending_deliveries.values().map(|pd| { + json!({ + "delivery_id": pd.delivery.delivery_id, + "worker_name": pd.worker_name, + "event_id": pd.delivery.event_id, + "attempts": pd.attempts, + }) + }).collect(); + let _ = reply.send(Ok(json!({ + "agent_count": 
workers.workers.len(), + "agents": workers.list(), + "pending_delivery_count": pending.len(), + "pending_deliveries": pending, + }))); + } + ListenApiRequest::GetCrashInsights { reply } => { + let _ = reply.send(Ok(crash_insights.to_json())); + } + ListenApiRequest::Preflight { agents, reply } => { + let count = agents.len(); + let _ = reply.send(Ok(json!({ "queued": count }))); + // Background preflight — same as stdio handler + for entry in agents { + let http = relaycast_http.clone(); + tokio::spawn(async move { + let _ = tokio::time::timeout( + Duration::from_secs(30), + http.register_agent_token(&entry.name, Some(&entry.cli)), + ).await; + }); + } + } + ListenApiRequest::SubscribeChannels { name, channels, reply } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + continue; + }; + let mut added = Vec::new(); + for ch in &channels { + let exists = handle.spec.channels.iter() + .any(|c| c.eq_ignore_ascii_case(ch)); + if !exists { + handle.spec.channels.push(ch.clone()); + added.push(ch.clone()); + } + } + let all_channels = handle.spec.channels.clone(); + let _ = reply.send(Ok(json!({ + "name": name, + "channels": all_channels, + }))); + } + ListenApiRequest::UnsubscribeChannels { name, channels, reply } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + continue; + }; + handle.spec.channels.retain(|c| { + !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c)) + }); + let remaining = handle.spec.channels.clone(); + let _ = reply.send(Ok(json!({ + "name": name, + "channels": remaining, + }))); + } + ListenApiRequest::GetInboundDeliveryMode { name, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let mode = delivery_states + .get(&name) + .map(|s| s.mode) + .unwrap_or_default(); + let _ = reply.send(Ok(mode)); + } + } + 
ListenApiRequest::SetInboundDeliveryMode { name, mode, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let entry = delivery_states.entry(name.clone()).or_default(); + let previous = entry.mode; + entry.mode = mode; + let to_flush: Vec = if previous + == InboundDeliveryMode::ManualFlush + && mode == InboundDeliveryMode::AutoInject + { + entry.drain_pending() + } else { + Vec::new() + }; + let flushed = to_flush.len(); + if !to_flush.is_empty() { + tracing::info!( + target = "agent_relay::broker", + worker = %name, + drained = flushed, + "draining pending queue on manual_flush → auto_inject transition" + ); + } + for queued in to_flush { + inject_pending_relay_message( + &mut workers, + &mut pending_deliveries, + &name, + &queued, + delivery_retry_interval, + ) + .await; + } + tracing::info!( + target = "agent_relay::broker", + worker = %name, + previous_mode = previous.as_wire_str(), + mode = mode.as_wire_str(), + flushed, + "inbound delivery mode updated" + ); + if previous != mode { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"agent_inbound_delivery_mode_changed", + "name":&name, + "previous_mode":previous.as_wire_str(), + "mode":mode.as_wire_str(), + }), + ).await; + } + if flushed > 0 { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"agent_pending_drained", + "name":&name, + "count":flushed, + "reason":"delivery_mode_transition", + }), + ).await; + } + let _ = reply.send(Ok(SetInboundDeliveryModeOk { mode, flushed })); + } + } + ListenApiRequest::GetPending { name, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let snapshot = delivery_states + .get(&name) + .map(|s| s.pending_snapshot()) + .unwrap_or_default(); + let _ = reply.send(Ok(snapshot)); + } + } + ListenApiRequest::FlushPending { name, reply } => { + if !workers.has_worker(&name) { + let _ = 
reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let to_flush: Vec = delivery_states + .get_mut(&name) + .map(|state| state.drain_pending()) + .unwrap_or_default(); + let flushed = to_flush.len(); + if flushed > 0 { + tracing::info!( + target = "agent_relay::broker", + worker = %name, + drained = flushed, + "flushing pending queue on explicit /flush" + ); + } + for queued in to_flush { + inject_pending_relay_message( + &mut workers, + &mut pending_deliveries, + &name, + &queued, + delivery_retry_interval, + ) + .await; + } + if flushed > 0 { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"agent_pending_drained", + "name":&name, + "count":flushed, + "reason":"explicit_flush", + }), + ).await; + } + let _ = reply.send(Ok(flushed)); + } + } + ListenApiRequest::Shutdown { reply } => { + let _ = reply.send(Ok(json!({ "status": "shutting_down" }))); + shutdown = true; + } + ListenApiRequest::RenewLease { reply } => { + last_lease_renewal = Instant::now(); + let expires_in = lease_duration.map(|d| d.as_secs()).unwrap_or(0); + let _ = reply.send(Ok(json!({ + "renewed": true, + "expires_in_secs": expires_in, + "persist": cmd.persist, + }))); + } + } + } + } + + // Stdin is no longer used for SDK communication — all control + // goes through the HTTP/WS API. We drain stdin to avoid + // blocking if anything writes to it, and stop polling after EOF. 
+ result = sdk_lines.next_line(), if stdin_open => { + if matches!(result, Ok(None) | Err(_)) { + stdin_open = false; + } + } + + ws_msg = ws_inbound_rx.recv() => { + if let Some(ws_msg) = ws_msg { + let workspace_id = ws_msg.workspace_id.clone(); + let workspace_alias = ws_msg.workspace_alias.clone(); + let ws_value = ws_msg.value; + let workspace_state = workspace_lookup + .get(&workspace_id) + .cloned() + .unwrap_or_else(|| default_workspace.clone()); + let workspace_self_name = workspace_state.self_name.clone(); + let workspace_self_names = workspace_state.self_names.clone(); + let workspace_self_agent_ids = workspace_state.self_agent_ids.clone(); + let workspace_http = workspace_state.http_client.clone(); + let ws_type = ws_value + .get("type") + .and_then(Value::as_str) + .unwrap_or(""); + tracing::info!( + target = "agent_relay::broker", + ws_type = %ws_type, + workspace_id = %workspace_id, + event = %ws_value, + "received relaycast ws event" + ); + + let control_dedup_key = if matches!( + ws_type, + "agent.spawn_requested" | "agent.release_requested" + ) { + relaycast_ws_control_dedup_key(&workspace_id, ws_type, &ws_value) + } else { + None + }; + + if let Some(ref control_dedup_key) = control_dedup_key { + if !dedup.insert_if_new(control_dedup_key, Instant::now()) { + tracing::info!( + ws_type = %ws_type, + workspace_id = %workspace_id, + "dropping duplicate relaycast control event" + ); + continue; + } + } + + if matches!(ws_type, "agent.spawn_requested" | "agent.release_requested") { + if let Err(ref deser_err) = serde_json::from_value::(ws_value.clone()) { + eprintln!( + "[agent-relay] WARNING: failed to deserialize {} event: {}", + ws_type, deser_err + ); + } + } + if let Ok(ws_event) = serde_json::from_value::(ws_value.clone()) { + match ws_event { + WsEvent::AgentReleaseRequested(event) => { + let name = event.agent.name; + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + 
workspace_http.forget_agent_registration(&name); + tracing::debug!( + worker = %name, + "ignoring relaycast release request for broker self" + ); + continue; + } + workers.supervisor.unregister(&name); + workers.metrics.on_release(&name); + match workers.release(&name).await { + Ok(()) => { + workspace_http.forget_agent_registration(&name); + let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + &sdk_out_tx, + json!({"kind":"delivery_dropped","name":name,"count":dropped,"reason":"agent_released"}), + ).await; + } + fail_pending_requests_for_worker(&mut pending_requests, &name, "relaycast_release"); + delivery_states.remove(&name); + telemetry.track(TelemetryEvent::AgentRelease { + cli: String::new(), + release_reason: "relaycast_release".to_string(), + lifetime_seconds: 0, + release_source: ActionSource::Protocol, + }); + state.agents.remove(&name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + let _ = send_event( + &sdk_out_tx, + json!({"kind":"agent_released","name":name}), + ).await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "exited", + Some("relaycast_release"), + ) + .await; + tracing::info!(child = %name, "released worker via relaycast in broker mode"); + eprintln!("[agent-relay] released worker '{}' via relaycast", name); + } + Err(error) => { + let message = error.to_string(); + if is_unknown_worker_error_message(&message) { + workspace_http.forget_agent_registration(&name); + state.agents.remove(&name); + if paths.persist { + if let Err(save_error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + error = %save_error, + "failed to persist broker state" + ); + } + } + tracing::debug!( + child = %name, + "ignoring duplicate relaycast release for already exited worker" + ); + } else { + 
tracing::error!(child = %name, error = %error, "failed to release worker via relaycast"); + eprintln!("[agent-relay] failed to release '{}': {}", name, error); + } + } + } + continue; + } + WsEvent::AgentSpawnRequested(event) => { + let name = event.agent.name; + eprintln!("[agent-relay] received spawn request for '{}' (cli: {})", name, event.agent.cli); + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + tracing::debug!( + worker = %name, + "ignoring relaycast spawn request for broker self" + ); + eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); + continue; + } + let local_spawn_echo_key = + relaycast_spawn_control_dedup_key(&workspace_id, &name); + if relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key.as_deref(), + &local_spawn_echo_key, + ) && !dedup.insert_if_new(&local_spawn_echo_key, Instant::now()) + { + tracing::info!( + worker = %name, + workspace_id = %workspace_id, + "dropping duplicate/local relaycast spawn request" + ); + eprintln!("[agent-relay] dropping duplicate spawn request for '{}'", name); + continue; + } + let cli = event.agent.cli; + let task = Some(event.agent.task).filter(|value| !value.trim().is_empty()); + let channel = event.agent.channel; + + tracing::info!(name = %name, cli = %cli, task = ?task, channel = ?channel, "handling spawn request from relaycast WS"); + let channels = channel + .as_deref() + .map(|ch| { + let mut chs = default_spawn_channels(); + if !chs.contains(&ch.to_string()) { + chs.push(ch.to_string()); + } + chs + }) + .unwrap_or_else(default_spawn_channels); + let spec = AgentSpec { + name: name.clone(), + runtime: AgentRuntime::Pty, + provider: None, + cli: Some(cli.clone()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: vec![], + channels: channels.clone(), + restart_policy: None, + }; + let effective_task = normalize_initial_task(task.clone()); + + // Pre-register agent 
token. Claude doesn't need this — it + // bakes the API key into --mcp-config JSON and self-registers. + // Non-Claude CLIs need the token injected into their CLI args + // at spawn time, so we do a quick (3s) registration attempt. + let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); + let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); + let worker_relay_key = { + let ws_token = relaycast_ws_spawn_token(&ws_value); + if ws_token.is_some() { + ws_token + } else if is_claude { + // Claude self-registers via its MCP server — skip blocking call + None + } else { + const REG_TIMEOUT: Duration = Duration::from_secs(3); + match tokio::time::timeout( + REG_TIMEOUT, + workspace_http.register_agent_token(&name, Some(cli.as_str())), + ).await { + Ok(Ok(token)) => { + tracing::info!( + worker = %name, + "pre-registered agent via broker for WS spawn" + ); + Some(token) + } + Ok(Err(error)) => { + tracing::warn!( + worker = %name, + error = %error, + "WS spawn pre-registration failed; agent will self-register" + ); + None + } + Err(_) => { + tracing::warn!( + worker = %name, + "WS spawn pre-registration timed out (3s); agent will self-register" + ); + None + } + } + } + }; + + match workers.spawn( + spec, + Some("Relaycast".to_string()), + None, + worker_relay_key.clone(), + false, + Some(workspace_id.clone()), + ).await { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers.initial_tasks.insert(name.clone(), task_text.clone()); + } + agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::Protocol, + has_task: effective_task.is_some(), + is_shadow: false, + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent 
{ + runtime: AgentRuntime::Pty, + parent: Some("Relaycast".to_string()), + channels, + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + + }, + ); + if paths.persist { let _ = state.save(&paths.state); } + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "agent_spawned", + "name": name, + "runtime": "pty", + "cli": cli, + "model": effective_spec.model.clone(), + "pid": pid, + "source": "relaycast_ws", + "pre_registered": worker_relay_key.is_some(), + }), + ).await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "spawned", + Some("relaycast_spawn"), + ) + .await; + tracing::info!(child = %name, pid, "spawned worker via relaycast WS"); + eprintln!("[agent-relay] spawned worker '{}' via relaycast", name); + } + Err(e) => { + let msg = e.to_string(); + if msg.contains("already exists") { + tracing::debug!(child = %name, "agent already spawned via SDK, skipping duplicate relaycast WS spawn"); + } else { + tracing::error!(child = %name, error = %e, "failed to spawn worker via relaycast WS"); + eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); + } + } + } + continue; + } + _ => {} + } + } else if ws_type == "agent.spawn_requested" { + // Fallback: the SDK failed to deserialize the event (e.g. missing + // fields like `already_existed` or `task: null`). Extract the + // spawn info directly from the raw JSON so we don't silently + // drop the request. 
+ let agent_obj = ws_value.get("agent"); + let name = agent_obj + .and_then(|a| a.get("name")) + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let cli = agent_obj + .and_then(|a| a.get("cli")) + .and_then(Value::as_str) + .unwrap_or("claude") + .to_string(); + let task = agent_obj + .and_then(|a| a.get("task")) + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let channel = agent_obj + .and_then(|a| a.get("channel")) + .and_then(Value::as_str) + .map(String::from); + + if !name.is_empty() { + eprintln!("[agent-relay] handling spawn request for '{}' via JSON fallback (cli: {})", name, cli); + + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); + } else { + let local_spawn_echo_key = + relaycast_spawn_control_dedup_key(&workspace_id, &name); + let should_dedup = relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key.as_deref(), + &local_spawn_echo_key, + ); + // Always insert the local echo key for consistency with the primary path + let is_new = dedup.insert_if_new(&local_spawn_echo_key, Instant::now()); + if !should_dedup || is_new + { + let channels = channel + .as_deref() + .map(|ch| { + let mut chs = default_spawn_channels(); + if !chs.contains(&ch.to_string()) { + chs.push(ch.to_string()); + } + chs + }) + .unwrap_or_else(default_spawn_channels); + let spec = AgentSpec { + name: name.clone(), + runtime: AgentRuntime::Pty, + provider: None, + cli: Some(cli.clone()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: vec![], + channels: channels.clone(), + restart_policy: None, + }; + let task_opt = Some(task).filter(|v| !v.trim().is_empty()); + let effective_task = normalize_initial_task(task_opt.clone()); + + // Pre-register (same logic as primary WS spawn path). 
+ let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); + let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); + let worker_relay_key = { + let ws_token = relaycast_ws_spawn_token(&ws_value); + if ws_token.is_some() { + ws_token + } else if is_claude { + None + } else { + const REG_TIMEOUT: Duration = Duration::from_secs(3); + match tokio::time::timeout( + REG_TIMEOUT, + workspace_http.register_agent_token(&name, Some(cli.as_str())), + ).await { + Ok(Ok(token)) => Some(token), + Ok(Err(error)) => { + tracing::warn!( + worker = %name, + error = %error, + "WS spawn fallback pre-registration failed" + ); + None + } + Err(_) => { + tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out (3s)"); + None + } + } + } + }; + + match workers.spawn( + spec, + Some("Relaycast".to_string()), + None, + worker_relay_key.clone(), + false, + Some(workspace_id.clone()), + ).await { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers.initial_tasks.insert(name.clone(), task_text.clone()); + } + agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::Protocol, + has_task: effective_task.is_some(), + is_shadow: false, + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent { + runtime: AgentRuntime::Pty, + parent: Some("Relaycast".to_string()), + channels, + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + + }, + ); + if paths.persist { let _ = state.save(&paths.state); } + let _ = send_event( 
+ &sdk_out_tx, + json!({ + "kind": "agent_spawned", + "name": name, + "runtime": "pty", + "cli": cli, + "model": effective_spec.model.clone(), + "pid": pid, + "source": "relaycast_ws_fallback", + "pre_registered": worker_relay_key.is_some(), + }), + ).await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "spawned", + Some("relaycast_spawn"), + ) + .await; + eprintln!("[agent-relay] spawned worker '{}' via relaycast (JSON fallback)", name); + } + Err(e) => { + let msg = e.to_string(); + if !msg.contains("already exists") { + eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); + } + } + } + } else { + eprintln!("[agent-relay] dropping duplicate spawn request for '{}' (fallback)", name); + } + } + } + // Don't fall through to map_ws_event for control events + // handled by the JSON fallback path. + continue; + } + + // Preserve the raw channel from the WS event for thread replies. + // The mapper may set target = "thread" (synthetic) when the SDK + // struct lacks a channel field; we use the raw value to fix + // display_target so the dashboard can route the message correctly. 
+ let raw_ws_channel = ws_value + .get("channel") + .and_then(Value::as_str) + .map(String::from); + + if let Some(mapped) = map_ws_event(&ws_value, &workspace_id, workspace_alias.as_deref()) { + tracing::info!( + from = %mapped.from, + target = %mapped.target, + kind = ?mapped.kind, + event_id = %mapped.event_id, + text_len = mapped.text.len(), + "mapped inbound WS event" + ); + let dedup_key = format!("{}:{}", mapped.workspace_id, mapped.event_id); + if !dedup.insert_if_new(&dedup_key, Instant::now()) { + tracing::info!(event_id = %mapped.event_id, workspace_id = %mapped.workspace_id, "dropping duplicate event"); + continue; + } + let has_local_target = if mapped.target.starts_with('#') { + !workers + .worker_names_for_channel_delivery(&mapped.target, &mapped.from, Some(&workspace_id)) + .is_empty() + } else if matches!(mapped.kind, InboundKind::ThreadReply) && mapped.target == "thread" { + // Thread replies target "thread" (synthetic), not a specific worker. + // Treat as having a local target when any worker exists so the + // self-echo filter doesn't drop dashboard-originated thread replies. + workers.has_any_worker() + } else { + workers.has_worker_by_name_ignoring_case(&mapped.target) + }; + if routing::is_self_echo( + &mapped, + &workspace_self_names, + &workspace_self_agent_ids, + has_local_target, + ) { + tracing::info!(from = %mapped.from, sender_agent_id = ?mapped.sender_agent_id, self_names = ?workspace_self_names, "skipping self-echo in broker loop"); + continue; + } + + telemetry.track(TelemetryEvent::MessageSend { + is_broadcast: mapped.target.starts_with('#'), + has_thread: mapped.thread_id.is_some(), + }); + + let mut delivery_plan = { + let worker_view = workers.routing_workers(); + routing::resolve_delivery_targets(&mapped, &worker_view) + }; + + // For thread replies with synthetic target "thread", override + // display_target with the actual channel so the dashboard can + // route the message to the correct channel/DM view. 
+ if matches!(mapped.kind, InboundKind::ThreadReply) + && delivery_plan.display_target == "thread" + { + if let Some(ref ch) = raw_ws_channel { + let chan_target = if ch.starts_with('#') { + ch.clone() + } else { + format!("#{ch}") + }; + tracing::info!( + original_target = "thread", + resolved_target = %chan_target, + "overriding thread reply display_target with raw WS channel" + ); + delivery_plan.display_target = chan_target; + } + } + + if mapped.target.starts_with('#') { + tracing::info!( + channel = %mapped.target, + from = %mapped.from, + target_count = delivery_plan.targets.len(), + targets = ?delivery_plan.targets, + "channel delivery targets" + ); + } else { + tracing::info!( + target = %mapped.target, + from = %mapped.from, + kind = ?mapped.kind, + direct_targets = ?delivery_plan.targets, + "direct message routing" + ); + } + + if delivery_plan.needs_dm_resolution { + let conversation_id = mapped.target.clone(); + tracing::info!(conversation_id = %conversation_id, "resolving DM participants"); + let participants = resolve_dm_participants_cached( + &workspace_http, + &mut dm_participants_cache, + &workspace_id, + &conversation_id, + ) + .await; + tracing::info!(participants = ?participants, "resolved DM participants"); + + if let Some(participant) = participants + .iter() + .find(|participant| !agent_name_eq(participant, &mapped.from)) + { + delivery_plan.display_target = participant.clone(); + } + + let worker_view = workers.routing_workers(); + delivery_plan.targets = routing::worker_names_for_dm_participants( + &worker_view, + &participants, + &mapped.from, + Some(&workspace_id), + ); + tracing::info!(dm_targets = ?delivery_plan.targets, "DM participant-based routing targets"); + } + + for worker_name in delivery_plan.targets { + // Inbound-delivery queue: mirrors the /api/send + // queue above. Auto-inject workers drain the queue + // immediately; manual-flush workers leave relaycast + // messages parked until flush. 
The same full-context + // capture makes drains reproduce the original + // delivery (channel/thread/workspace). + match queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + &worker_name, + InboundContext { + from: &mapped.from, + body: &mapped.text, + target: &mapped.target, + thread_id: mapped.thread_id.as_deref(), + workspace_id: Some(mapped.workspace_id.as_str()), + workspace_alias: mapped.workspace_alias.as_deref(), + priority: mapped.priority.as_u8(), + mode: MessageInjectionMode::Wait, + event_id: Some(&mapped.event_id), + }, + ) { + InboundQueueOutcome::Queued => { + tracing::info!( + target = "agent_relay::broker", + event_id = %mapped.event_id, + worker = %worker_name, + "queued inbound relay message (manual_flush inbound delivery mode)" + ); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"delivery_queued", + "name":&worker_name, + "event_id":&mapped.event_id, + "from":&mapped.from, + "target":&mapped.target, + "reason":"inbound_delivery_manual_flush", + }), + ).await; + continue; + } + InboundQueueOutcome::DrainNow(to_drain) => { + for queued in to_drain { + if let Err(error) = try_inject_pending_relay_message( + &mut workers, + &mut pending_deliveries, + &worker_name, + &queued, + delivery_retry_interval, + ) + .await + { + let _ = send_error( + &sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } + } + continue; + } + InboundQueueOutcome::WorkerMissing => {} + } + if let Err(error) = queue_and_try_delivery( + &mut workers, + &mut pending_deliveries, + &worker_name, + &mapped, + delivery_retry_interval, + ).await { + let _ = send_error(&sdk_out_tx, None, "delivery_failed", error.to_string(), true, Some(json!({"worker": worker_name}))).await; + } + } + + let display_target = + display_target_for_dashboard(&delivery_plan.display_target, &workspace_self_names, &workspace_self_name); + let display_from = if is_self_name(&workspace_self_names, &mapped.from) 
+ { + workspace_self_name.clone() + } else { + mapped.from.clone() + }; + tracing::info!( + from = %display_from, + display_target = %display_target, + event_id = %mapped.event_id, + body_len = mapped.text.len(), + "broadcasting relay_inbound to dashboard" + ); + record_thread_history_event( + &mut recent_thread_messages, + json!({ + "event_id": mapped.event_id.clone(), + "from": display_from.clone(), + "target": display_target.clone(), + "text": mapped.text.clone(), + "thread_id": mapped.thread_id.clone(), + "workspace_id": mapped.workspace_id.clone(), + "workspace_alias": mapped.workspace_alias.clone(), + "timestamp": chrono::Utc::now().to_rfc3339(), + }), + ); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": mapped.event_id, + "from": display_from, + "target": display_target, + "body": mapped.text, + "thread_id": mapped.thread_id, + "workspace_id": mapped.workspace_id, + "workspace_alias": mapped.workspace_alias, + }), + ).await; + } else if ws_type != "broker.connection" && ws_type != "broker.channel_join" { + tracing::info!( + target = "agent_relay::broker", + ws_type = %ws_type, + event = %ws_value, + "relaycast ws event ignored by inbound mapper" + ); + } + } + } + + worker_event = worker_event_rx.recv() => { + if let Some(worker_event) = worker_event { + match worker_event { + WorkerEvent::Message { name, value } => { + if let Some(msg_type) = value.get("type").and_then(Value::as_str) { + if msg_type == "delivery_ack" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + + // Terminal guard: ignore late delivery_ack events once a + // delivery has reached terminal failed status. 
+ if !delivery_id.is_empty() + && terminal_failed_deliveries.contains(delivery_id) + { + tracing::info!( + worker = %name, + delivery_id = %delivery_id, + "ignoring late delivery_ack after terminal failed status" + ); + continue; + } + + if let Ok(ack) = serde_json::from_value::(payload.clone()) { + clear_pending_delivery_if_event_matches( + &mut pending_deliveries, + &ack.delivery_id, + Some(&ack.event_id), + &name, + "delivery_ack", + ); + terminal_failed_deliveries.remove(&ack.delivery_id); + } + let _ = send_event(&sdk_out_tx, json!({ + "kind": "delivery_ack", + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + })).await; + } + } else if msg_type == "delivery_queued" { + if let Some(payload) = value.get("payload") { + let _ = send_event(&sdk_out_tx, json!({ + "kind": msg_type, + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + })).await; + } + } else if msg_type == "delivery_injected" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + let event_id = + payload.get("event_id").and_then(Value::as_str); + clear_pending_delivery_if_event_matches( + &mut pending_deliveries, + delivery_id, + event_id, + &name, + "delivery_injected", + ); + let _ = send_event(&sdk_out_tx, json!({ + "kind": msg_type, + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + })).await; + } + } else if msg_type == "delivery_verified" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); + let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); + tracing::debug!( + target = "agent_relay::broker", + worker = %name, + delivery_id = 
%delivery_id, + event_id = %event_id, + "delivery verified by echo detection" + ); + clear_pending_delivery_if_event_matches( + &mut pending_deliveries, + delivery_id, + Some(event_id), + &name, + "delivery_verified", + ); + let _ = send_event(&sdk_out_tx, json!({ + "kind": "delivery_verified", + "name": name, + "delivery_id": delivery_id, + "event_id": event_id, + })).await; + } + } else if msg_type == "delivery_active" { + if let Some(payload) = value.get("payload") { + let _ = send_event(&sdk_out_tx, json!({ + "kind": "delivery_active", + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "pattern": payload.get("pattern"), + })).await; + } + } else if msg_type == "delivery_failed" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); + let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); + let reason = payload.get("reason").and_then(Value::as_str).unwrap_or("unknown"); + tracing::warn!( + target = "agent_relay::broker", + worker = %name, + delivery_id = %delivery_id, + event_id = %event_id, + reason = %reason, + "delivery failed — echo not detected" + ); + clear_pending_delivery_if_event_matches( + &mut pending_deliveries, + delivery_id, + Some(event_id), + &name, + "delivery_failed", + ); + if !delivery_id.is_empty() { + terminal_failed_deliveries + .insert(delivery_id.to_string()); + } + let _ = send_event(&sdk_out_tx, json!({ + "kind": "delivery_failed", + "name": name, + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + })).await; + } + } else if msg_type == "worker_error" { + let _ = send_event(&sdk_out_tx, json!({ + "kind": "worker_error", + "name": name, + "error": value.get("payload").cloned().unwrap_or(Value::Null) + })).await; + } else if msg_type.ends_with("_response") { + // Generic worker request/response dispatch. 
+ // Any frame whose `type` ends in + // `_response` is routed by `request_id` + // into the matching parked `oneshot` in + // `pending_requests`. The pending entry + // owns the format/error decoding logic + // via `worker_request::fulfil_response_frame`. + let routed = worker_request::fulfil_response_frame( + &mut pending_requests, + &value, + ); + if !routed { + let req_id = value + .get("request_id") + .and_then(Value::as_str) + .unwrap_or(""); + tracing::debug!( + target = "agent_relay::broker", + worker = %name, + msg_type = %msg_type, + request_id = %req_id, + "worker response with no pending caller — dropping" + ); + } + } else if msg_type == "worker_stream" { + let _ = send_event(&sdk_out_tx, json!({ + "kind": "worker_stream", + "name": name, + "stream": value.get("payload").and_then(|p| p.get("stream")).cloned().unwrap_or(Value::String("stdout".to_string())), + "chunk": value.get("payload").and_then(|p| p.get("chunk")).cloned().unwrap_or(Value::String(String::new())), + })).await; + } else if msg_type == "worker_ready" { + if let Some(task_text) = workers.initial_tasks.remove(&name) { + let event_id = format!("init_{}", Uuid::new_v4().simple()); + if let Err(e) = queue_and_try_delivery_raw( + &mut workers, + &mut pending_deliveries, + &name, + &event_id, + "broker", + &name, + &task_text, + None, + None, + None, + 2, + MessageInjectionMode::Wait, + delivery_retry_interval, + ).await { + tracing::warn!(worker = %name, error = %e, "failed to deliver initial_task"); + } + } + let runtime = value.get("payload") + .and_then(|p| p.get("runtime")) + .and_then(Value::as_str) + .unwrap_or("pty"); + let (provider_val, cli_val, model_val) = workers.workers.get(&name) + .map(|h| (h.spec.provider.clone(), h.spec.cli.clone(), h.spec.model.clone())) + .unwrap_or((None, None, None)); + let _ = send_event(&sdk_out_tx, json!({ + "kind": "worker_ready", + "name": name, + "runtime": runtime, + "provider": provider_val, + "cli": cli_val, + "model": model_val, + })).await; + } 
else if msg_type == "agent_idle" { + let idle_secs = value.get("payload") + .and_then(|p| p.get("idle_secs")) + .and_then(Value::as_u64) + .unwrap_or(0); + let _ = send_event(&sdk_out_tx, json!({ + "kind": "agent_idle", + "name": name, + "idle_secs": idle_secs, + })).await; + publish_agent_state_transition( + &ws_control_tx, + &name, + "idle", + Some("idle_threshold"), + ) + .await; + } else if msg_type == "agent_exit" { + let reason = value.get("payload") + .and_then(|p| p.get("reason")) + .and_then(Value::as_str) + .unwrap_or("unknown"); + tracing::info!(agent = %name, reason = %reason, "agent requested exit"); + let _ = send_event(&sdk_out_tx, json!({ + "kind": "agent_exit", + "name": name, + "reason": reason, + })).await; + } else if msg_type == "continuity_command" { + // Agent-initiated continuity: the pty_worker detected a + // KIND: continuity block in PTY output and emitted this event. + let action = value.get("payload") + .and_then(|p| p.get("action")) + .and_then(Value::as_str) + .unwrap_or(""); + let content = value.get("payload") + .and_then(|p| p.get("content")) + .and_then(Value::as_str) + .unwrap_or(""); + match action { + "save" => { + let cont_dir = continuity_dir(&paths.state); + if let Err(e) = std::fs::create_dir_all(&cont_dir) { + tracing::warn!( + agent = %name, + error = %e, + "continuity_command save: failed to create dir" + ); + } else { + // Build a minimal continuity record with the provided summary. 
+ let agent_data = state.agents.get(&name); + let cli = agent_data + .and_then(|d| d.spec.as_ref()) + .and_then(|s| s.cli.clone()); + let initial_task = agent_data + .and_then(|d| d.initial_task.clone()); + let continuity = json!({ + "agent_name": name, + "cli": cli, + "initial_task": initial_task, + "released_at": null, + "lifetime_seconds": null, + "message_history": [], + "summary": content, + }); + let cont_file = cont_dir.join(format!("{}.json", name)); + match std::fs::write( + &cont_file, + serde_json::to_string_pretty(&continuity) + .unwrap_or_default(), + ) { + Ok(()) => tracing::info!( + agent = %name, + path = %cont_file.display(), + "continuity_command: saved agent-initiated continuity" + ), + Err(e) => tracing::warn!( + agent = %name, + error = %e, + "continuity_command save: failed to write file" + ), + } + } + } + "load" => { + let cont_dir = continuity_dir(&paths.state); + let cont_file = cont_dir.join(format!("{}.json", name)); + if cont_file.exists() { + match std::fs::read_to_string(&cont_file) { + Ok(raw) => { + if let Ok(ctx) = serde_json::from_str::(&raw) { + // Build a context summary and inject it + let prev_task = ctx.get("initial_task") + .and_then(Value::as_str) + .unwrap_or("unknown"); + let summary = ctx.get("summary") + .and_then(Value::as_str) + .unwrap_or("no summary"); + let history_str = ctx.get("message_history") + .and_then(Value::as_array) + .map(|msgs| { + msgs.iter() + .filter_map(|m| { + let from = m.get("from")?.as_str()?; + let text = m.get("text") + .or_else(|| m.get("body"))? 
+ .as_str()?; + Some(format!(" - {}: {}", from, text)) + }) + .collect::>() + .join("\n") + }) + .unwrap_or_default(); + let history_section = if history_str.is_empty() { + String::new() + } else { + format!("\nRecent messages:\n{}", history_str) + }; + let inject_body = format!( + "## Continuity Context (from previous session as '{}')\n\ + Previous task: {}\n\ + Session summary: {}{}", + name, prev_task, summary, history_section + ); + let event_id = format!("cont_load_{}", Uuid::new_v4().simple()); + if let Err(e) = queue_and_try_delivery_raw( + &mut workers, + &mut pending_deliveries, + &name, + &event_id, + "broker", + &name, + &inject_body, + None, + None, + None, + 2, + MessageInjectionMode::Wait, + delivery_retry_interval, + ).await { + tracing::warn!( + agent = %name, + error = %e, + "continuity_command load: failed to inject context" + ); + } else { + tracing::info!( + agent = %name, + "continuity_command: injected loaded context" + ); + } + } + } + Err(e) => tracing::warn!( + agent = %name, + error = %e, + "continuity_command load: failed to read file" + ), + } + } else { + tracing::debug!( + agent = %name, + "continuity_command load: no continuity file found" + ); + } + } + "uncertain" => { + tracing::info!( + agent = %name, + content = %content, + "continuity_command: agent reported uncertainty" + ); + } + other => { + tracing::warn!( + agent = %name, + action = %other, + "continuity_command: unknown action ignored" + ); + } + } + } else if msg_type == "worker_exited" { + // PTY worker process is exiting — clean up and + // emit agent_exited so the SDK doesn't have to + // wait for the reap_exited polling cycle. 
+ let code = value.get("payload") + .and_then(|p| p.get("code")) + .and_then(Value::as_i64) + .map(|c| c as i32); + let signal = value.get("payload") + .and_then(|p| p.get("signal")) + .and_then(Value::as_str) + .map(String::from); + tracing::info!( + agent = %name, + code = ?code, + signal = ?signal, + "worker_exited received — cleaning up" + ); + // Remove from registry so reap_exited won't + // double-process this worker. + workers.workers.remove(&name); + workers.initial_tasks.remove(&name); + // Drop pending deliveries for this worker + let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "delivery_dropped", + "name": name, + "count": dropped, + "reason": "worker_exited", + }), + ).await; + } + fail_pending_requests_for_worker(&mut pending_requests, &name, "worker_exited"); + delivery_states.remove(&name); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "agent_exited", + "name": name, + "code": code, + "signal": signal, + }), + ).await; + publish_agent_state_transition( + &ws_control_tx, + &name, + "exited", + Some("worker_exited"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(&name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark exited worker offline in relaycast" + ); + } + state.agents.remove(&name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + error = %error, + "failed to persist broker state" + ); + } + } + } + } + } + } + } + } + + _ = reap_tick.tick() => { + let now = Instant::now(); + + // Time out worker request/response calls whose worker never + // responded. Common cause: worker crashed between us sending + // the request frame and it parsing the frame. Without this + // sweep the HTTP handler would hang forever on its oneshot. 
+ for (req_id, worker_name, kind) in + worker_request::reap_expired(&mut pending_requests, now) + { + tracing::warn!( + target = "agent_relay::broker", + request_id = %req_id, + worker = %worker_name, + kind = %kind, + "worker request timed out before worker responded" + ); + } + + let due_ids: Vec = pending_deliveries + .iter() + .filter_map(|(delivery_id, pending)| { + if pending.next_retry_at <= now { + Some(delivery_id.clone()) + } else { + None + } + }) + .collect(); + + for delivery_id in due_ids { + let was_retry = pending_deliveries + .get(&delivery_id) + .map(|pending| pending.attempts > 0) + .unwrap_or(false); + + match retry_pending_delivery( + &delivery_id, + &mut workers, + &mut pending_deliveries, + delivery_retry_interval, + ) + .await { + Ok(Some((worker_name, attempts, event_id))) => { + if was_retry { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"delivery_retry", + "name": worker_name, + "delivery_id": delivery_id, + "event_id": event_id, + "attempts": attempts, + }), + ).await; + } + } + Ok(None) => { + if was_retry { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "delivery_dropped", + "delivery_id": delivery_id, + "reason": "max_retries_exceeded", + }), + ).await; + } + } + Err(error) => { + let _ = send_error( + &sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"delivery_id": delivery_id})), + ).await; + } + } + } + + let exited = match workers.reap_exited().await { + Ok(v) => v, + Err(e) => { + tracing::warn!(err = %e, "reap_exited failed, skipping this cycle"); + vec![] + } + }; + for (name, code, signal) in &exited { + // Record crash in insights + let (category, description) = relay_broker::crash_insights::CrashInsights::analyze(*code, signal.as_deref()); + crash_insights.record(relay_broker::crash_insights::CrashRecord { + agent_name: name.clone(), + exit_code: *code, + signal: signal.clone(), + timestamp: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + 
.unwrap_or_default() + .as_secs(), + uptime_secs: 0, + category, + description, + }); + + telemetry.track(TelemetryEvent::AgentCrash { + cli: String::new(), + exit_code: *code, + lifetime_seconds: 0, + }); + + // Check supervisor for restart decision + use relay_broker::supervisor::RestartDecision; + match workers.supervisor.on_exit(name, *code, signal.as_deref()) { + Some(RestartDecision::Restart { delay }) => { + // Keep pending deliveries — we'll redeliver after restart + workers.metrics.on_crash(name); + let restart_count = workers.supervisor.restart_count(name) + 1; + tracing::info!( + name = %name, + exit_code = ?code, + signal = ?signal, + restart_count, + delay_ms = delay.as_millis() as u64, + "agent will be restarted" + ); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "agent_restarting", + "name": name, + "code": code, + "signal": signal, + "restart_count": restart_count, + "delay_ms": delay.as_millis() as u64, + }), + ).await; + publish_agent_state_transition( + &ws_control_tx, + name, + "stuck", + Some("restarting"), + ) + .await; + } + Some(RestartDecision::PermanentlyDead { reason }) => { + workers.metrics.on_permanent_death(name); + let dropped = drop_pending_for_worker(&mut pending_deliveries, name); + if dropped > 0 { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"delivery_dropped", + "name": name, + "count": dropped, + "reason":"worker_permanently_dead", + }), + ).await; + } + fail_pending_requests_for_worker(&mut pending_requests, name, "worker_permanently_dead"); + delivery_states.remove(name); + let _ = send_event( + &sdk_out_tx, + json!({"kind":"agent_permanently_dead","name":name,"reason":reason}), + ).await; + publish_agent_state_transition( + &ws_control_tx, + name, + "stuck", + Some("permanently_dead"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark permanently dead worker offline in relaycast" + ); + } + 
state.agents.remove(name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + } + None => { + // Not supervised — original behavior + let dropped = drop_pending_for_worker(&mut pending_deliveries, name); + if dropped > 0 { + let _ = send_event( + &sdk_out_tx, + json!({ + "kind":"delivery_dropped", + "name": name, + "count": dropped, + "reason":"worker_exited", + }), + ).await; + } + fail_pending_requests_for_worker(&mut pending_requests, name, "worker_exited"); + delivery_states.remove(name); + let _ = send_event( + &sdk_out_tx, + json!({"kind":"agent_exited","name":name,"code":code,"signal":signal}), + ).await; + publish_agent_state_transition( + &ws_control_tx, + name, + "exited", + Some("worker_exited"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark exited worker offline in relaycast" + ); + } + state.agents.remove(name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + } + } + } + + // Check for agents ready to restart (past cooldown) + if !shutdown { + let pending_restarts = workers.supervisor.pending_restarts(); + for (name, rst) in pending_restarts { + if let Some(remaining) = relaycast_http.registration_block_remaining(&name) + { + tracing::debug!( + worker = %name, + retry_after_secs = remaining.as_secs().max(1), + "skipping restart while relaycast registration is rate-limited" + ); + continue; + } + + let worker_relay_key = if rst.skip_relay_prompt { + None + } else { + match relaycast_http + .register_agent_token(&name, rst.spec.cli.as_deref()) + .await + { + Ok(token) => Some(token), + Err(error) => { + match registration_retry_after_secs(&error) { + Some(retry_after_secs) => { + 
tracing::warn!( + worker = %name, + retry_after_secs, + error = %error, + "restart blocked by relaycast registration rate limit" + ); + } + None => { + tracing::error!( + worker = %name, + error = %error, + "failed to pre-register worker before restart" + ); + } + } + continue; + } + } + }; + + match workers + .spawn( + rst.spec.clone(), + rst.parent.clone(), + None, + worker_relay_key, + rst.skip_relay_prompt, + None, + ) + .await + { + Ok(_) => { + workers.supervisor.on_restarted(&name); + workers.metrics.on_restart(&name); + if let Some(task) = rst.initial_task { + workers.initial_tasks.insert(name.clone(), task); + } + tracing::info!(name = %name, restart_count = rst.restart_count, "agent restarted"); + let _ = send_event( + &sdk_out_tx, + json!({ + "kind": "agent_restarted", + "name": name, + "restart_count": rst.restart_count, + }), + ).await; + publish_agent_state_transition( + &ws_control_tx, + &name, + "spawned", + Some("restarted"), + ) + .await; + } + Err(e) => { + tracing::error!(name = %name, error = %e, "restart failed"); + } + } + } + } + + // Persist pending deliveries for crash recovery + if paths.persist { + if let Err(error) = save_pending_deliveries(&paths.pending, &pending_deliveries) { + tracing::warn!(path = %paths.pending.display(), error = %error, "failed to persist pending deliveries"); + } + } + } + } + } + + // Save crash insights before shutdown (only in persist mode) + if paths.persist { + if let Err(error) = crash_insights.save(&crash_insights_path) { + tracing::warn!(error = %error, "failed to save crash insights"); + } + } + + telemetry.track(TelemetryEvent::BrokerStop { + uptime_seconds: broker_start.elapsed().as_secs(), + agent_spawn_count, + }); + telemetry.shutdown(); + + let active_workers: Vec = workers.workers.keys().cloned().collect(); + for worker_name in active_workers { + if let Err(error) = relaycast_http.mark_agent_offline(&worker_name).await { + tracing::warn!( + worker = %worker_name, + error = %error, + "failed to 
mark worker offline during shutdown" + ); + } + } + + // Mark broker agent offline in Relaycast before shutting down WS + if let Err(error) = relaycast_http.mark_offline().await { + tracing::warn!(error = %error, "failed to mark broker offline during shutdown"); + } + + if let Err(error) = ws_control_tx.send(WsControl::Shutdown).await { + tracing::warn!(error = %error, "failed to send ws shutdown signal"); + } + pending_deliveries.clear(); + // Clean shutdown — remove pending file since nothing is pending + if paths.persist { + let _ = std::fs::remove_file(&paths.pending); + } + workers.shutdown_all().await?; + + // Clean up state and connection files on graceful shutdown + if paths.persist { + let _ = std::fs::remove_file(&paths.state); + } + let connection_path = paths.state.parent().unwrap().join("connection.json"); + let _ = std::fs::remove_file(&connection_path); + + Ok(()) +} + +/// Get terminal rows from TIOCGWINSZ. +#[cfg(unix)] +pub(crate) fn terminal_rows() -> Option { + use nix::libc; + use nix::pty::Winsize; + let mut ws = Winsize { + ws_row: 0, + ws_col: 0, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe { + if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_row > 0 { + Some(ws.ws_row) + } else { + None + } + } +} + +/// Get terminal cols from TIOCGWINSZ. 
+#[cfg(unix)] +pub(crate) fn terminal_cols() -> Option { + use nix::libc; + use nix::pty::Winsize; + let mut ws = Winsize { + ws_row: 0, + ws_col: 0, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe { + if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_col > 0 { + Some(ws.ws_col) + } else { + None + } + } +} + +#[cfg(not(unix))] +pub(crate) fn terminal_rows() -> Option { + None +} +#[cfg(not(unix))] +pub(crate) fn terminal_cols() -> Option { + None +} + +#[cfg(target_os = "linux")] +pub(crate) fn memory_bytes_for_pid(pid: u32) -> u64 { + let statm_path = format!("/proc/{pid}/statm"); + let statm = match std::fs::read_to_string(statm_path) { + Ok(contents) => contents, + Err(_) => return 0, + }; + + let rss_pages = match statm + .split_whitespace() + .nth(1) + .and_then(|value| value.parse::().ok()) + { + Some(value) => value, + None => return 0, + }; + + let page_size = unsafe { nix::libc::sysconf(nix::libc::_SC_PAGESIZE) }; + if page_size <= 0 { + return 0; + } + + rss_pages.saturating_mul(page_size as u64) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn memory_bytes_for_pid(_pid: u32) -> u64 { + 0 +} + +pub(crate) fn build_agent_metrics(handle: &WorkerHandle) -> AgentMetrics { + let pid = handle.child.id().unwrap_or_default(); + AgentMetrics { + name: handle.spec.name.clone(), + pid, + memory_bytes: if pid == 0 { + 0 + } else { + memory_bytes_for_pid(pid) + }, + uptime_secs: handle.spawned_at.elapsed().as_secs(), + } +} + +/// Outcome of [`queue_inbound_for_delivery_mode`]. Distinguishes the +/// three cases broker call sites care about: the message is queued and +/// should wait for an explicit flush, the queue should be drained now, +/// or there's no worker (caller falls through to existing target handling). +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum InboundQueueOutcome { + Queued, + DrainNow(Vec), + WorkerMissing, +} + +/// Bundle of routing context captured into the pending queue. 
Mirrors the +/// args `queue_and_try_delivery_raw` +/// expects so a drain reproduces the original delivery exactly — same +/// target (channel / DM / thread sentinel), thread, workspace, +/// priority, and injection mode. +pub(crate) struct InboundContext<'a> { + from: &'a str, + body: &'a str, + target: &'a str, + thread_id: Option<&'a str>, + workspace_id: Option<&'a str>, + workspace_alias: Option<&'a str>, + priority: u8, + mode: MessageInjectionMode, + event_id: Option<&'a str>, +} + +/// Queue an inbound relay message through the per-worker [`InboundDeliveryMode`]. +/// +/// Every inbound message is appended to the per-worker pending queue. In +/// [`InboundDeliveryMode::AutoInject`] the caller immediately drains the queue +/// in the same broker turn; in [`InboundDeliveryMode::ManualFlush`] the message +/// stays parked until an explicit flush or mode transition. +/// +/// Pulled out so the broker has one obvious choke point for the two +/// inbound paths (`/api/send` and the relaycast inbound feed) that the +/// `drive` client needs to intercept. Internal broker-driven injections +/// (`worker_ready` initial task, continuity restore) bypass this queue by +/// not calling this helper. 
+pub(crate) fn queue_inbound_for_delivery_mode( + delivery_states: &mut HashMap, + workers: &WorkerRegistry, + worker_name: &str, + ctx: InboundContext<'_>, +) -> InboundQueueOutcome { + if !workers.has_worker(worker_name) { + return InboundQueueOutcome::WorkerMissing; + } + let state = delivery_states.entry(worker_name.to_string()).or_default(); + let should_drain = state.should_drain_immediately(); + let queued_at_ms = chrono::Utc::now().timestamp_millis().max(0) as u64; + let msg = PendingRelayMessage { + from: ctx.from.to_string(), + body: ctx.body.to_string(), + target: ctx.target.to_string(), + thread_id: ctx.thread_id.map(str::to_string), + workspace_id: ctx.workspace_id.map(str::to_string), + workspace_alias: ctx.workspace_alias.map(str::to_string), + priority: ctx.priority, + mode: ctx.mode, + queued_at_ms, + event_id: ctx.event_id.map(str::to_string), + }; + match state.accept_inbound(msg) { + InboundDeliveryDispatch::Queued { queue_len } => { + tracing::debug!( + target = "agent_relay::broker", + worker = %worker_name, + from = %ctx.from, + mode = state.mode.as_wire_str(), + queue_len, + "queued inbound relay message" + ); + } + InboundDeliveryDispatch::QueuedEvicted { + queue_len, + dropped_from, + } => { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + from = %ctx.from, + dropped_from = %dropped_from, + mode = state.mode.as_wire_str(), + queue_len, + max_pending = relay_broker::types::MAX_PENDING_PER_WORKER, + "pending queue full — evicting oldest message" + ); + } + } + if should_drain { + let to_drain = state.drain_pending(); + tracing::debug!( + target = "agent_relay::broker", + worker = %worker_name, + drained = to_drain.len(), + "draining inbound queue immediately (auto_inject delivery mode)" + ); + InboundQueueOutcome::DrainNow(to_drain) + } else { + InboundQueueOutcome::Queued + } +} + +pub(crate) async fn try_inject_pending_relay_message( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + 
worker_name: &str, + msg: &PendingRelayMessage, + retry_interval: Duration, +) -> Result<()> { + let event_id = msg + .event_id + .clone() + .unwrap_or_else(|| format!("flush_{}", Uuid::new_v4().simple())); + match timeout( + retry_interval, + queue_and_try_delivery_raw( + workers, + pending_deliveries, + worker_name, + &event_id, + &msg.from, + // Use the ORIGINAL routing target captured at queue time — + // `#general`, the DM recipient name, `"thread"`, etc. Falling + // back to `worker_name` here would silently reframe channel + // messages as direct-to-worker messages on drain. + &msg.target, + &msg.body, + msg.thread_id.clone(), + msg.workspace_id.clone(), + msg.workspace_alias.clone(), + msg.priority, + msg.mode.clone(), + retry_interval, + ), + ) + .await + { + Ok(result) => result, + Err(_) => Err(anyhow::anyhow!( + "pending relay delivery timed out after {}ms", + retry_interval.as_millis() + )), + } +} + +/// Inject a previously-queued pending relay message into the worker via +/// the existing `queue_and_try_delivery_raw` path. Used by the +/// `/api/spawned/{name}/flush` handler and by the auto-drain on a +/// `manual_flush → auto_inject` transition. Failures are logged but not +/// propagated — the broker treats `flush` as best-effort fire-and-forget +/// the same way `/api/send` does for individual targets. 
+pub(crate) async fn inject_pending_relay_message( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + msg: &PendingRelayMessage, + retry_interval: Duration, +) { + let event_id = msg.event_id.as_deref().unwrap_or(""); + if let Err(error) = try_inject_pending_relay_message( + workers, + pending_deliveries, + worker_name, + msg, + retry_interval, + ) + .await + { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + from = %msg.from, + event_id = %event_id, + error = %error, + "failed to inject pending relay message during flush" + ); + } +} + +pub(crate) async fn queue_and_try_delivery( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + mapped: &relay_broker::types::InboundRelayEvent, + retry_interval: Duration, +) -> Result<()> { + queue_and_try_delivery_raw( + workers, + pending_deliveries, + worker_name, + &mapped.event_id, + &mapped.from, + &mapped.target, + &mapped.text, + mapped.thread_id.clone(), + Some(mapped.workspace_id.clone()), + mapped.workspace_alias.clone(), + mapped.priority.as_u8(), + MessageInjectionMode::Wait, + retry_interval, + ) + .await +} + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn queue_and_try_delivery_raw( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + event_id: &str, + from: &str, + target: &str, + body: &str, + thread_id: Option, + workspace_id: Option, + workspace_alias: Option, + priority: u8, + injection_mode: MessageInjectionMode, + retry_interval: Duration, +) -> Result<()> { + let delivery = RelayDelivery { + delivery_id: format!("del_{}", Uuid::new_v4().simple()), + event_id: event_id.to_string(), + workspace_id, + workspace_alias, + from: from.to_string(), + target: target.to_string(), + body: body.to_string(), + thread_id, + priority: Some(priority), + injection_mode, + }; + let delivery_id = delivery.delivery_id.clone(); + pending_deliveries.insert( + 
delivery_id.clone(), + PendingDelivery { + worker_name: worker_name.to_string(), + delivery, + attempts: 0, + next_retry_at: Instant::now(), + }, + ); + + let _ = + retry_pending_delivery(&delivery_id, workers, pending_deliveries, retry_interval).await?; + Ok(()) +} + +pub(crate) async fn retry_pending_delivery( + delivery_id: &str, + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + retry_interval: Duration, +) -> Result> { + let pending = match pending_deliveries.get(delivery_id) { + Some(pending) => pending.clone(), + None => return Ok(None), + }; + + if pending.attempts >= MAX_DELIVERY_RETRIES { + pending_deliveries.remove(delivery_id); + return Ok(None); + } + + if !workers.has_worker(&pending.worker_name) { + pending_deliveries.remove(delivery_id); + return Ok(None); + } + + match workers + .deliver(&pending.worker_name, pending.delivery.clone()) + .await + { + Ok(()) => { + if let Some(current) = pending_deliveries.get_mut(delivery_id) { + current.attempts = current.attempts.saturating_add(1); + current.next_retry_at = Instant::now() + retry_interval; + return Ok(Some(( + current.worker_name.clone(), + current.attempts, + current.delivery.event_id.clone(), + ))); + } + Ok(None) + } + Err(error) => { + if let Some(current) = pending_deliveries.get_mut(delivery_id) { + current.next_retry_at = Instant::now() + retry_interval; + } + Err(error) + } + } +} + +pub(crate) fn drop_pending_for_worker( + pending_deliveries: &mut HashMap, + worker_name: &str, +) -> usize { + let before = pending_deliveries.len(); + pending_deliveries.retain(|_, pending| pending.worker_name != worker_name); + before.saturating_sub(pending_deliveries.len()) +} + +/// Drain every in-flight worker request targeting `worker_name` and +/// notify each awaiter with [`worker_request::RequestWorkerError::WorkerDisappeared`]. 
+/// Called from every worker-teardown path (explicit release, +/// `worker_exited` frame, `reap_exited` periodic sweep) so HTTP callers +/// don't have to wait out the request deadline when the worker has +/// clearly gone. Logs one structured warning per drained request. +pub(crate) fn fail_pending_requests_for_worker( + pending_requests: &mut HashMap, + worker_name: &str, + reason: &'static str, +) -> usize { + let failed = worker_request::fail_for_worker(pending_requests, worker_name); + for (req_id, kind) in &failed { + tracing::warn!( + target = "agent_relay::broker", + request_id = %req_id, + worker = %worker_name, + kind = %kind, + reason = reason, + "failed pending worker request because worker is gone" + ); + } + failed.len() +} + +pub(crate) fn should_clear_pending_delivery_for_event( + pending: Option<&PendingDelivery>, + event_id: Option<&str>, +) -> bool { + let Some(pending) = pending else { + return true; + }; + + let Some(event_id) = event_id + .map(str::trim) + .filter(|event_id| !event_id.is_empty()) + else { + return true; + }; + + pending.delivery.event_id == event_id +} + +pub(crate) fn clear_pending_delivery_if_event_matches( + pending_deliveries: &mut HashMap, + delivery_id: &str, + event_id: Option<&str>, + worker_name: &str, + worker_signal: &str, +) { + let pending = pending_deliveries.get(delivery_id); + if should_clear_pending_delivery_for_event(pending, event_id) { + pending_deliveries.remove(delivery_id); + return; + } + + if let Some(pending) = pending { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + signal = %worker_signal, + delivery_id = %delivery_id, + expected_event_id = %pending.delivery.event_id, + received_event_id = %event_id.unwrap_or(""), + "ignoring stale delivery lifecycle event due to event_id mismatch" + ); + } +} + +pub(crate) async fn run_headless_worker(cmd: HeadlessCommand) -> Result<()> { + let provider: ProtocolHeadlessProvider = cmd.provider.into(); + let provider_name = 
headless_provider_cli_name(&provider); + let provider_args = cmd.args.clone(); + + let (out_tx, mut out_rx) = mpsc::channel::>(512); + let writer_task = tokio::spawn(async move { + // Keep one async stdout handle for this process. Tokio's `write_all` + // is not cancel-safe if the task is aborted mid-write, so shutdown + // below drops `out_tx` and awaits this task before returning. + let mut stdout = tokio::io::stdout(); + while let Some(frame) = out_rx.recv().await { + if let Ok(mut line) = serde_json::to_string(&frame) { + line.push('\n'); + if stdout.write_all(line.as_bytes()).await.is_err() || stdout.flush().await.is_err() + { + break; + } + } + } + }); + + let mut lines = BufReader::new(tokio::io::stdin()).lines(); + let mut worker_name = cmd + .agent_name + .clone() + .unwrap_or_else(|| format!("headless-{provider_name}")); + let mut final_exit_code: Option = None; + let mut final_exit_signal: Option = None; + + while let Ok(Some(line)) = lines.next_line().await { + let frame: ProtocolEnvelope = match serde_json::from_str(&line) { + Ok(frame) => frame, + Err(error) => { + let _ = send_frame( + &out_tx, + "worker_error", + None, + json!({ + "code":"invalid_frame", + "message": error.to_string(), + "retryable": false, + }), + ) + .await; + continue; + } + }; + + match frame.msg_type.as_str() { + "init_worker" => { + worker_name = cmd + .agent_name + .clone() + .or_else(|| { + frame + .payload + .get("agent") + .and_then(|a| a.get("name")) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .unwrap_or_else(|| format!("headless-{provider_name}")); + + let _ = send_frame( + &out_tx, + "worker_ready", + frame.request_id, + json!({ + "name": &worker_name, + "runtime": "headless", + }), + ) + .await; + } + "deliver_relay" => { + let request_id = frame.request_id.clone(); + let delivery: RelayDelivery = match serde_json::from_value(frame.payload) { + Ok(d) => d, + Err(error) => { + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + 
"code":"invalid_delivery", + "message": error.to_string(), + "retryable": false, + }), + ) + .await; + continue; + } + }; + + let timestamp = chrono::Utc::now().timestamp_millis(); + let delivery_id = delivery.delivery_id; + let event_id = delivery.event_id; + + let _ = send_frame( + &out_tx, + "delivery_queued", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "agent": &worker_name, + "timestamp": timestamp, + }), + ) + .await; + + let _ = send_frame( + &out_tx, + "delivery_injected", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "agent": &worker_name, + "timestamp": timestamp, + }), + ) + .await; + + let _ = send_frame( + &out_tx, + "delivery_active", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "pattern": format!("headless:{}", provider_name), + }), + ) + .await; + + let task_text = delivery.body.clone(); + let (binary, args) = + headless_provider_command(&provider, &task_text, &provider_args); + + let mut child_cmd = tokio::process::Command::new(&binary); + child_cmd + .args(&args) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + // Auto-approve tool permissions for opencode in headless mode. 
+ if matches!(provider, ProtocolHeadlessProvider::Opencode) { + child_cmd.env( + "OPENCODE_PERMISSION", + r#"{"*":"allow","external_directory":{"*":"allow"}}"#, + ); + } + + let mut child = match child_cmd.spawn() { + Ok(child) => child, + Err(error) => { + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": format!("failed to spawn {}: {}", binary, error), + }), + ) + .await; + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + "code":"spawn_failed", + "message": format!("failed to spawn {}: {}", binary, error), + "retryable": false, + }), + ) + .await; + final_exit_code = Some(1); + break; + } + }; + + let _ = send_frame( + &out_tx, + "delivery_ack", + request_id.clone(), + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + }), + ) + .await; + + let stdout = child.stdout.take(); + let stderr = child.stderr.take(); + + let stream_stdout = { + let out_tx = out_tx.clone(); + async move { + if let Some(stdout) = stdout { + let mut lines = BufReader::new(stdout).lines(); + while let Ok(Some(chunk)) = lines.next_line().await { + let _ = send_frame( + &out_tx, + "worker_stream", + None, + json!({ + "stream": "stdout", + "chunk": chunk, + }), + ) + .await; + } + } + } + }; + + let stream_stderr = { + let out_tx = out_tx.clone(); + async move { + if let Some(stderr) = stderr { + let mut lines = BufReader::new(stderr).lines(); + while let Ok(Some(chunk)) = lines.next_line().await { + let _ = send_frame( + &out_tx, + "worker_stream", + None, + json!({ + "stream": "stderr", + "chunk": chunk, + }), + ) + .await; + } + } + } + }; + + let (status, _, _) = tokio::join!(child.wait(), stream_stdout, stream_stderr); + + match status { + Ok(exit_status) => { + final_exit_code = exit_status.code(); + final_exit_signal = None; + if exit_status.success() { + let _ = send_frame( + &out_tx, + "delivery_verified", + None, + json!({ + "delivery_id": delivery_id, + 
"event_id": event_id, + }), + ) + .await; + } else { + let reason = match exit_status.code() { + Some(code) => format!("{} exited with code {}", binary, code), + None => format!("{} exited without an exit code", binary), + }; + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + }), + ) + .await; + } + } + Err(error) => { + let reason = format!("failed waiting for {}: {}", binary, error); + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + }), + ) + .await; + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + "code":"wait_failed", + "message": format!("failed waiting for {}: {}", binary, error), + "retryable": false, + }), + ) + .await; + final_exit_code = Some(1); + } + } + + break; + } + "ping" => { + let ts = frame + .payload + .get("ts_ms") + .and_then(Value::as_u64) + .unwrap_or_default(); + let _ = send_frame(&out_tx, "pong", frame.request_id, json!({"ts_ms": ts})).await; + } + "shutdown_worker" => { + break; + } + other => { + let _ = send_frame( + &out_tx, + "worker_error", + frame.request_id, + json!({ + "code":"unknown_type", + "message": format!("unsupported message type '{}'", other), + "retryable": false, + }), + ) + .await; + } + } + } + + let _ = send_frame( + &out_tx, + "worker_exited", + None, + json!({"code": final_exit_code, "signal": final_exit_signal}), + ) + .await; + drop(out_tx); + let _ = writer_task.await; + + Ok(()) +} + +pub(crate) async fn send_error( + tx: &mpsc::Sender>, + request_id: Option, + code: &str, + message: String, + retryable: bool, + data: Option, +) -> Result<()> { + send_frame( + tx, + "error", + request_id, + json!({ + "code": code, + "message": message, + "retryable": retryable, + "data": data, + }), + ) + .await +} + +pub(crate) async fn send_event( + tx: &mpsc::Sender>, + payload: Value, +) -> Result<()> { 
+ send_frame(tx, "event", None, payload).await +} + +pub(crate) async fn emit_http_api_event_with_timeout( + tx: &mpsc::Sender>, + payload: Value, + timeout_window: Duration, +) { + match timeout(timeout_window, send_event(tx, payload)).await { + Ok(Ok(())) => {} + Ok(Err(error)) => { + tracing::warn!( + target = "relay_broker::http_api", + error = %error, + "failed to enqueue HTTP API event" + ); + } + Err(_) => { + tracing::warn!( + target = "relay_broker::http_api", + timeout_ms = %timeout_window.as_millis(), + "timed out enqueuing HTTP API event" + ); + } + } +} + +pub(crate) async fn send_frame( + tx: &mpsc::Sender>, + msg_type: &str, + request_id: Option, + payload: Value, +) -> Result<()> { + tx.send(ProtocolEnvelope { + v: PROTOCOL_VERSION, + msg_type: msg_type.to_string(), + request_id, + payload, + }) + .await + .context("failed to enqueue outbound frame") +} + +pub(crate) fn init_tracing() { + let (writer, guard) = tracing_appender::non_blocking(std::io::stderr()); + let subscriber = tracing_subscriber::fmt::Subscriber::builder() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .with_target(true) + .with_writer(writer) + .finish(); + if tracing::subscriber::set_global_default(subscriber).is_ok() { + let _ = TRACING_GUARD.set(guard); + } +} + +pub(crate) fn channels_from_csv(raw: &str) -> Vec { + raw.split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +/// Default channels for freshly spawned agents. +/// Reads RELAY_DEFAULT_CHANNELS (comma-separated) or falls back to the +/// broker's default channels: vec!["general", "engineering"] — both created +/// at startup by ensure_default_channels(). 
+pub(crate) fn default_spawn_channels() -> Vec { + if let Ok(raw) = std::env::var("RELAY_DEFAULT_CHANNELS") { + let parsed = channels_from_csv(&raw); + if !parsed.is_empty() { + return parsed; + } + } + // channels: ["general", "engineering"] (must match ensure_default_channels) + vec!["general".to_string(), "engineering".to_string()] +} + +pub(crate) fn command_targets_self(cmd_event: &BrokerCommandEvent, self_agent_id: &str) -> bool { + match cmd_event.handler_agent_id.as_deref() { + Some(handler_id) => handler_id == self_agent_id, + None => { + tracing::warn!( + command = %cmd_event.command, + invoked_by = %cmd_event.invoked_by, + "command has no handler_agent_id; accepting by default (multi-broker setups should scope commands)" + ); + true + } + } +} + +pub(crate) fn env_flag_enabled(name: &str) -> bool { + std::env::var(name) + .ok() + .map(|value| value.trim().to_ascii_lowercase()) + .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes" | "on")) +} + +pub(crate) fn delivery_retry_interval() -> Duration { + let ms = std::env::var("AGENT_RELAY_DELIVERY_RETRY_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_DELIVERY_RETRY_MS); + Duration::from_millis(ms.max(50)) +} + +pub(crate) fn http_api_local_delivery_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS); + Duration::from_millis(ms.max(100)) +} + +pub(crate) fn http_api_relaycast_send_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS); + Duration::from_millis(ms.max(500)) +} + +pub(crate) fn http_api_event_emit_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + 
.unwrap_or(DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS); + Duration::from_millis(ms.max(25)) +} + +pub(crate) fn normalize_channel(raw: &str) -> String { + let trimmed = raw.trim(); + if trimmed.starts_with('#') { + trimmed.to_string() + } else { + format!("#{trimmed}") + } +} + +pub(crate) fn build_agent_state_transition_event( + name: &str, + state: &str, + reason: Option<&str>, +) -> Value { + let mut payload = json!({ + "type": "agent.state", + "state": state, + "agent": { "name": name }, + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + if let Some(reason) = reason.map(str::trim).filter(|value| !value.is_empty()) { + payload["reason"] = json!(reason); + } + payload +} + +pub(crate) async fn publish_agent_state_transition( + ws_control_tx: &mpsc::Sender, + name: &str, + state: &str, + reason: Option<&str>, +) { + let event = build_agent_state_transition_event(name, state, reason); + if let Err(error) = ws_control_tx.send(WsControl::Publish(event)).await { + tracing::debug!( + agent = %name, + state = %state, + error = %error, + "failed to publish agent state transition" + ); + } +} + +pub(crate) fn normalize_identity_for_thread(raw: &str) -> String { + raw.trim().trim_start_matches('@').to_ascii_lowercase() +} + +pub(crate) fn json_scalar_to_string(value: &Value) -> Option { + match value { + Value::String(text) => { + let trimmed = text.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + } + Value::Number(number) => Some(number.to_string()), + _ => None, + } +} + +pub(crate) fn first_string(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(json_scalar_to_string)) +} + +pub(crate) fn first_bool(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(Value::as_bool)) +} + +pub(crate) fn first_u64(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| 
value.pointer(pointer).and_then(Value::as_u64)) +} + +pub(crate) fn first_i64(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(Value::as_i64)) +} + +pub(crate) fn relaycast_ws_control_dedup_key( + workspace_id: &str, + ws_type: &str, + value: &Value, +) -> Option { + let identity = if ws_type == "agent.spawn_requested" { + relaycast_ws_spawn_token(value) + .or_else(|| { + first_string( + value, + &[ + "/event_id", + "/id", + "/payload/id", + "/payload/event_id", + "/agent/id", + "/agent/event_id", + "/message/id", + "/message/event_id", + "/message_id", + ], + ) + }) + .or_else(|| first_string(value, &["/agent/name", "/payload/agent/name", "/name"])) + } else { + first_string( + value, + &[ + "/event_id", + "/id", + "/payload/id", + "/payload/event_id", + "/agent/id", + "/agent/event_id", + "/message/id", + "/message/event_id", + "/message_id", + ], + ) + } + .or_else(|| serde_json::to_string(value).ok())?; + Some(format!("control:{workspace_id}:{ws_type}:{identity}")) +} + +pub(crate) fn relaycast_ws_spawn_token(value: &Value) -> Option { + first_string( + value, + &[ + "/agent/token", + "/agent/relay_key", + "/agent/api_key", + "/token", + ], + ) +} + +pub(crate) fn relaycast_spawn_control_dedup_key(workspace_id: &str, identity: &str) -> String { + format!("control:{workspace_id}:agent.spawn_requested:{identity}") +} + +pub(crate) fn relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key: Option<&str>, + local_spawn_echo_key: &str, +) -> bool { + control_dedup_key != Some(local_spawn_echo_key) +} + +pub(crate) fn note_local_spawn_control_dedup( + dedup: &mut DedupCache, + workspace_id: Option<&str>, + agent_name: &str, + relay_key: Option<&str>, +) { + let Some(workspace_id) = workspace_id else { + return; + }; + let agent_name = agent_name.trim(); + if !agent_name.is_empty() { + let key = relaycast_spawn_control_dedup_key(workspace_id, agent_name); + dedup.insert_if_new(&key, 
Instant::now()); + } + if let Some(relay_key) = relay_key.map(str::trim).filter(|value| !value.is_empty()) { + let key = relaycast_spawn_control_dedup_key(workspace_id, relay_key); + dedup.insert_if_new(&key, Instant::now()); + } +} + +pub(crate) fn is_unknown_worker_error_message(message: &str) -> bool { + message.contains("unknown worker '") +} + +pub(crate) fn is_relaycast_self_control_target( + name: &str, + workspace_self_name: &str, + workspace_self_names: &HashSet, +) -> bool { + let normalized = normalize_identity_for_thread(name); + normalized == normalize_identity_for_thread(workspace_self_name) + || workspace_self_names.contains(&normalized) +} + +pub(crate) fn message_sender(value: &Value) -> Option { + first_string( + value, + &[ + "/from", + "/sender", + "/author", + "/agent_name", + "/message/from", + "/message/sender", + "/message/author", + "/payload/from", + "/payload/sender", + "/payload/author", + "/payload/message/from", + "/payload/message/sender", + "/payload/message/author", + ], + ) +} + +pub(crate) fn message_target(value: &Value) -> Option { + first_string( + value, + &[ + "/target", + "/to", + "/recipient", + "/channel", + "/conversation_id", + "/conversationId", + "/message/target", + "/message/to", + "/message/recipient", + "/message/channel", + "/message/conversation_id", + "/message/conversationId", + "/payload/target", + "/payload/to", + "/payload/recipient", + "/payload/channel", + "/payload/conversation_id", + "/payload/conversationId", + "/payload/message/target", + "/payload/message/to", + "/payload/message/recipient", + "/payload/message/channel", + "/payload/message/conversation_id", + "/payload/message/conversationId", + ], + ) +} + +pub(crate) fn message_preview(value: &Value) -> Option { + let text = first_string( + value, + &[ + "/text", + "/body", + "/content", + "/message/text", + "/message/body", + "/message/content", + "/payload/text", + "/payload/body", + "/payload/content", + "/payload/message/text", + 
"/payload/message/body", + "/payload/message/content", + "/message", + "/payload/message", + ], + )?; + Some(truncate_thread_preview(&text, 200)) +} + +pub(crate) fn truncate_thread_preview(input: &str, max_len: usize) -> String { + let trimmed = input.trim(); + if trimmed.len() <= max_len { + return trimmed.to_string(); + } + let boundary = floor_char_boundary(trimmed, max_len); + let mut out = trimmed[..boundary].to_string(); + out.push_str("..."); + out +} + +pub(crate) fn parse_sort_key_from_raw_timestamp(raw: &str) -> Option { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + if let Ok(epoch) = trimmed.parse::() { + return Some(epoch); + } + chrono::DateTime::parse_from_rfc3339(trimmed) + .ok() + .map(|parsed| parsed.timestamp_millis()) +} + +pub(crate) fn message_timestamp_string(value: &Value) -> Option { + first_string( + value, + &[ + "/created_at", + "/createdAt", + "/timestamp", + "/ts", + "/message/created_at", + "/message/createdAt", + "/message/timestamp", + "/message/ts", + "/payload/created_at", + "/payload/createdAt", + "/payload/timestamp", + "/payload/ts", + "/payload/message/created_at", + "/payload/message/createdAt", + "/payload/message/timestamp", + "/payload/message/ts", + ], + ) +} + +pub(crate) fn message_sort_key(value: &Value, index: usize) -> i64 { + if let Some(raw) = message_timestamp_string(value) { + if let Some(parsed) = parse_sort_key_from_raw_timestamp(&raw) { + return parsed; + } + } + + first_i64( + value, + &[ + "/created_at", + "/createdAt", + "/timestamp", + "/ts", + "/message/created_at", + "/message/createdAt", + "/message/timestamp", + "/message/ts", + "/payload/created_at", + "/payload/createdAt", + "/payload/timestamp", + "/payload/ts", + ], + ) + .unwrap_or(index as i64) +} + +pub(crate) fn message_thread_id(value: &Value) -> Option { + if let Some(explicit) = first_string( + value, + &[ + "/thread_id", + "/threadId", + "/parent_id", + "/conversation_id", + "/conversationId", + 
"/message/thread_id", + "/message/threadId", + "/message/parent_id", + "/message/conversation_id", + "/message/conversationId", + "/payload/thread_id", + "/payload/threadId", + "/payload/parent_id", + "/payload/conversation_id", + "/payload/conversationId", + "/payload/message/thread_id", + "/payload/message/threadId", + "/payload/message/parent_id", + "/payload/message/conversation_id", + "/payload/message/conversationId", + ], + ) { + return Some(explicit); + } + + let target = message_target(value)?; + if target.starts_with('#') { + return Some(normalize_channel(&target)); + } + if target.starts_with("conv_") + || target.starts_with("dm_") + || target.chars().all(|ch| ch.is_ascii_digit()) + { + return Some(target); + } + + let sender = message_sender(value)?; + let sender = normalize_identity_for_thread(&sender); + let target = normalize_identity_for_thread(&target); + if sender.is_empty() || target.is_empty() { + return None; + } + let (first, second) = if sender <= target { + (sender, target) + } else { + (target, sender) + }; + Some(format!("direct:{first}:{second}")) +} + +pub(crate) fn is_self_identity(value: &str, self_names: &HashSet) -> bool { + let normalized = normalize_identity_for_thread(value); + !normalized.is_empty() + && self_names + .iter() + .any(|self_name| normalize_identity_for_thread(self_name) == normalized) +} + +pub(crate) fn derive_thread_name( + message: &Value, + thread_id: &str, + self_names: &HashSet, +) -> String { + if let Some(explicit) = first_string( + message, + &[ + "/thread_name", + "/threadName", + "/title", + "/subject", + "/conversation_name", + "/conversationName", + ], + ) { + return explicit; + } + + if thread_id.starts_with('#') { + return thread_id.to_string(); + } + + // Use participants array (from workspace-level DM data) to build a combined name + // like "WorkerA ↔ WorkerB" for DMs between non-broker agents. 
+ if let Some(participants) = message.get("participants").and_then(|v| v.as_array()) { + let names: Vec<&str> = participants + .iter() + .filter_map(|p| p.as_str()) + .filter(|name| !is_self_identity(name, self_names)) + .collect(); + if names.len() >= 2 { + return format!("{} ↔ {}", names[0], names[1]); + } else if names.len() == 1 { + return names[0].to_string(); + } + } + + if let Some(sender) = message_sender(message) { + if !is_self_identity(&sender, self_names) { + return sender.trim().trim_start_matches('@').to_string(); + } + } + + if let Some(target) = message_target(message) { + let trimmed = target.trim().trim_start_matches('@'); + if trimmed.starts_with('#') { + return normalize_channel(trimmed); + } + if !trimmed.is_empty() + && !trimmed.eq_ignore_ascii_case(thread_id) + && !is_self_identity(trimmed, self_names) + && !trimmed.starts_with("conv_") + && !trimmed.starts_with("dm_") + && !trimmed.chars().all(|ch| ch.is_ascii_digit()) + { + return trimmed.to_string(); + } + } + + thread_id.to_string() +} + +pub(crate) fn thread_unread_increment(message: &Value, self_names: &HashSet) -> usize { + if let Some(read) = first_bool( + message, + &[ + "/read", + "/is_read", + "/isRead", + "/message/read", + "/message/is_read", + "/message/isRead", + "/payload/read", + "/payload/is_read", + "/payload/isRead", + "/payload/message/read", + "/payload/message/is_read", + "/payload/message/isRead", + ], + ) { + return usize::from(!read); + } + + if let Some(sender) = message_sender(message) { + return usize::from(!is_self_identity(&sender, self_names)); + } + 0 +} + +pub(crate) fn build_thread_infos( + messages: &[Value], + self_names: &HashSet, +) -> Vec { + let mut by_thread: HashMap = HashMap::new(); + + for (index, message) in messages.iter().enumerate() { + let Some(thread_id) = message_thread_id(message) else { + continue; + }; + + let name = derive_thread_name(message, &thread_id, self_names); + let sort_key = message_sort_key(message, index); + let preview = 
message_preview(message); + let timestamp = message_timestamp_string(message); + let explicit_unread = first_u64( + message, + &[ + "/unread_count", + "/unreadCount", + "/message/unread_count", + "/message/unreadCount", + "/payload/unread_count", + "/payload/unreadCount", + "/payload/message/unread_count", + "/payload/message/unreadCount", + ], + ) + .map(|value| value as usize); + let unread_delta = thread_unread_increment(message, self_names); + + let entry = by_thread + .entry(thread_id.clone()) + .or_insert_with(|| ThreadAccumulator { + info: ThreadInfo { + thread_id: thread_id.clone(), + name: name.clone(), + unread_count: 0, + last_message: None, + last_message_at: None, + }, + sort_key, + }); + + if entry.info.name == entry.info.thread_id && name != entry.info.thread_id { + entry.info.name = name.clone(); + } + + if let Some(explicit_unread) = explicit_unread { + entry.info.unread_count = entry.info.unread_count.max(explicit_unread); + } else { + entry.info.unread_count = entry.info.unread_count.saturating_add(unread_delta); + } + + if sort_key >= entry.sort_key { + entry.sort_key = sort_key; + entry.info.name = name; + entry.info.last_message = preview; + entry.info.last_message_at = timestamp; + } + } + + let mut threads: Vec = by_thread.into_values().collect(); + threads.sort_by(|left, right| { + right + .sort_key + .cmp(&left.sort_key) + .then_with(|| left.info.thread_id.cmp(&right.info.thread_id)) + }); + + threads.into_iter().map(|entry| entry.info).collect() +} + +pub(crate) fn record_thread_history_event(history: &mut VecDeque, event: Value) { + if history.len() >= THREAD_HISTORY_LIMIT { + let _ = history.pop_front(); + } + history.push_back(event); +} + +/// Get current terminal size. Returns (rows, cols). +/// +/// Uses `crossterm::terminal::size()`, which is cross-platform: +/// TIOCGWINSZ on unix, GetConsoleScreenBufferInfo on Windows. 
+pub(crate) fn get_terminal_size() -> Option<(u16, u16)> { + crossterm::terminal::size() + .ok() + .map(|(cols, rows)| (rows, cols)) +} + +/// Detect Claude Code auto-suggestion ghost text. +/// +/// Auto-suggestions are rendered with reverse-video cursor + dim ghost text, +/// and often include the "↵ send" hint. +/// Extract Relaycast message IDs from MCP tool response output. +/// +/// When the agent sends a message via MCP (send_dm, send_message, etc.), +/// the response JSON contains `"id": ""`. We extract these IDs +/// and pre-seed the dedup cache so the WS echo of the same message is dropped. +/// This is more robust than name-based filtering since it works regardless +/// of what identity the MCP server registers with. +pub(crate) fn extract_mcp_message_ids(buffer: &str) -> Vec { + let mut ids = Vec::new(); + // Match patterns like "id": "147310274064424960" (Relaycast snowflake IDs are 18-digit numbers) + let mut search_start = 0; + while let Some(key_pos) = buffer[search_start..].find("\"id\"") { + let abs_pos = search_start + key_pos + 4; // skip past "id" + if abs_pos >= buffer.len() { + break; + } + let rest = &buffer[abs_pos..]; + // Skip whitespace and colon + let rest = rest.trim_start(); + let rest = if let Some(r) = rest.strip_prefix(':') { + r.trim_start() + } else { + search_start = abs_pos; + continue; + }; + // Extract quoted value + if let Some(r) = rest.strip_prefix('"') { + if let Some(end) = r.find('"') { + let value = &r[..end]; + // Only match numeric snowflake IDs (15-20 digits) + if value.len() >= 15 + && value.len() <= 20 + && value.chars().all(|c| c.is_ascii_digit()) + { + ids.push(value.to_string()); + } + } + } + search_start = abs_pos; + } + ids +} + +/// Returns the continuity directory path derived from the state file path. +/// State path is always `{cwd}/.agent-relay/state.json`, so parent is `{cwd}/.agent-relay/`. 
+pub(crate) fn continuity_dir(state_path: &Path) -> PathBuf { + state_path + .parent() + .expect("state_path always has a parent (.agent-relay/)") + .join("continuity") +} + +/// Create ephemeral runtime paths in the system temp directory. +/// +/// Unlike `ensure_runtime_paths`, this function: +/// - Writes nothing to the project directory +/// - Uses a deterministic temp directory derived from cwd+broker name so +/// duplicate brokers still collide on the same lock/PID files +/// +/// The temp directory is NOT removed on exit — the OS cleans it up on reboot. +/// State and pending-delivery files are still written there so they don't +/// interfere with the project tree; they're just ephemeral. +/// Ephemeral mode: no lock file, no PID file, no temp directory. +/// The broker lifecycle is tied to the parent process via stdin — when the +/// parent (SDK client) exits, stdin gets EOF and the broker shuts down. +/// Single-instance enforcement is unnecessary here because each SDK client +/// manages its own child process. +pub(crate) fn ensure_ephemeral_paths(_cwd: &Path, _broker_name: &str) -> Result { + // Use a random temp subdir so concurrent ephemeral brokers don't collide + // on state files. 
+ let root = std::env::temp_dir().join(format!("agent-relay-ephemeral-{}", std::process::id())); + std::fs::create_dir_all(&root) + .with_context(|| format!("failed to create ephemeral temp dir {}", root.display()))?; + + Ok(RuntimePaths { + persist: false, + state: root.join("state.json"), + pending: root.join("pending.json"), + _lock: None, + }) +} + +pub(crate) fn ensure_runtime_paths( + cwd: &Path, + broker_name: &str, + state_dir: Option<&Path>, +) -> Result { + let root = state_dir + .map(PathBuf::from) + .unwrap_or_else(|| cwd.join(".agent-relay")); + std::fs::create_dir_all(&root) + .with_context(|| format!("failed to create runtime dir {}", root.display()))?; + + // Sanitise name for use in filenames — keep only alphanumeric and hyphens + let safe_name: String = broker_name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect(); + + // Lock and PID files are per-broker-name so concurrent workflows can coexist. + let lock_path = root.join(format!("broker-{safe_name}.lock")); + let lock_file = std::fs::File::create(&lock_path) + .with_context(|| format!("failed to create lock file {}", lock_path.display()))?; + + #[cfg(unix)] + { + use std::os::unix::io::AsRawFd; + let fd = lock_file.as_raw_fd(); + let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + // Lock acquisition failed — check if the holder is still alive + // by reading the PID from connection.json. + let connection_path = root.join("connection.json"); + let old_pid = std::fs::read_to_string(&connection_path) + .ok() + .and_then(|c| serde_json::from_str::(&c).ok()) + .and_then(|v| v.get("pid").and_then(|p| p.as_u64())) + .map(|p| p as u32); + if let Some(old_pid) = old_pid { + if !broker::is_pid_alive(old_pid) { + tracing::warn!( + old_pid = old_pid, + "stale broker lock detected (PID {} is dead), recovering", + old_pid + ); + // The old process is dead — remove stale PID file and retry lock. 
+ // We drop and re-create the lock file to clear the stale flock. + drop(lock_file); + let lock_file = std::fs::File::create(&lock_path).with_context(|| { + format!( + "failed to re-create lock file after stale recovery {}", + lock_path.display() + ) + })?; + let fd = lock_file.as_raw_fd(); + let rc = + unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + anyhow::bail!( + "another broker instance is already running in this directory ({})", + root.display() + ); + } + // Successfully recovered — PID is written via connection.json at API start + return Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }); + } else { + anyhow::bail!( + "another broker instance is already running in this directory (pid: {}, {})", + old_pid, + root.display() + ); + } + } + // PID file missing or unreadable while lock is held — treat as stale. + // This happens when the user deletes .agent-relay/ while an old broker + // is still alive, or during the shutdown race (PID deleted before flock + // released). 
+ tracing::warn!( + "broker lock held but no valid PID file found, treating as stale and recovering" + ); + drop(lock_file); + let lock_file = std::fs::File::create(&lock_path).with_context(|| { + format!( + "failed to re-create lock file after stale recovery {}", + lock_path.display() + ) + })?; + let fd = lock_file.as_raw_fd(); + let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + anyhow::bail!( + "another broker instance is already running in this directory ({})", + root.display() + ); + } + return Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }); + } + } + + // PID is written via connection.json at API start + + Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }) +} + +pub(crate) fn derive_ws_base_url_from_http(http_base: &str) -> String { + let trimmed = http_base.trim(); + if let Some(rest) = trimmed.strip_prefix("https://") { + format!("wss://{rest}") + } else if let Some(rest) = trimmed.strip_prefix("http://") { + format!("ws://{rest}") + } else { + trimmed.to_string() + } +} + +#[cfg(test)] +mod tests { + use std::{ + collections::{BTreeSet, HashMap, HashSet}, + path::PathBuf, + process::Stdio, + time::{Duration, Instant}, + }; + + use crate::helpers::{ + detect_bypass_permissions_prompt, detect_claude_trust_prompt, floor_char_boundary, + format_injection, is_auto_suggestion, is_bypass_selection_menu, is_in_editor_mode, + strip_ansi, + }; + use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; + use relay_broker::protocol::{AgentSpec, MessageInjectionMode, RelayDelivery}; + use serde_json::{json, Value}; + use tokio::sync::mpsc; + + use super::{ + build_agent_state_transition_event, build_http_api_spawn_spec, build_thread_infos, + 
channels_from_csv, continuity_dir, delivery_retry_interval, derive_ws_base_url_from_http, + display_target_for_dashboard, drop_pending_for_worker, extract_mcp_message_ids, + http_api_event_emit_timeout, http_api_local_delivery_timeout, + http_api_relaycast_send_timeout, is_relaycast_self_control_target, + is_unknown_worker_error_message, normalize_channel, normalize_initial_task, + normalize_sender, queue_inbound_for_delivery_mode, relaycast_spawn_control_dedup_key, + relaycast_ws_control_dedup_key, relaycast_ws_should_apply_local_spawn_echo_dedup, + relaycast_ws_spawn_token, sender_is_dashboard_label, + should_clear_pending_delivery_for_event, AgentRuntime, InboundContext, InboundQueueOutcome, + PendingDelivery, ProtocolHeadlessProvider, + }; + use relay_broker::dedup::DedupCache; + use relay_broker::relaycast_ws::{ + format_worker_preregistration_error, RelaycastRegistrationError, + }; + use relay_broker::types::{InboundDeliveryMode, InboundDeliveryState}; + + async fn make_worker_registry_with_worker(name: &str) -> WorkerRegistry { + let (tx, _rx) = mpsc::channel::(16); + let mut registry = WorkerRegistry::new( + tx, + Vec::new(), + PathBuf::from("/tmp/agent-relay-broker-tests"), + Instant::now(), + ); + let mut child = tokio::process::Command::new("cat") + .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("test worker process should spawn"); + let stdin = child.stdin.take().expect("test worker stdin should exist"); + registry.workers.insert( + name.to_string(), + WorkerHandle { + spec: AgentSpec { + name: name.to_string(), + runtime: AgentRuntime::Pty, + provider: None, + cli: Some("cat".to_string()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: Vec::new(), + channels: Vec::new(), + restart_policy: None, + }, + parent: None, + workspace_id: Some("ws_demo".to_string()), + child, + stdin, + spawned_at: Instant::now(), + }, + ); + registry + } + + async fn 
cleanup_worker_registry(mut registry: WorkerRegistry) { + for handle in registry.workers.values_mut() { + let _ = handle.child.start_kill(); + let _ = handle.child.wait().await; + } + } + + fn inbound_ctx<'a>(event_id: &'a str) -> InboundContext<'a> { + InboundContext { + from: "Alice", + body: "hello from relay", + target: "#general", + thread_id: Some("thr_123"), + workspace_id: Some("ws_demo"), + workspace_alias: Some("Demo"), + priority: 1, + mode: MessageInjectionMode::Steer, + event_id: Some(event_id), + } + } + + #[tokio::test] + async fn inbound_queue_auto_inject_drains_immediately_with_full_context() { + let worker_name = "worker-a"; + let workers = make_worker_registry_with_worker(worker_name).await; + let mut delivery_states = HashMap::new(); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + worker_name, + inbound_ctx("evt_auto"), + ); + + match outcome { + InboundQueueOutcome::DrainNow(messages) => { + assert_eq!(messages.len(), 1); + let msg = &messages[0]; + assert_eq!(msg.from, "Alice"); + assert_eq!(msg.body, "hello from relay"); + assert_eq!(msg.target, "#general"); + assert_eq!(msg.thread_id.as_deref(), Some("thr_123")); + assert_eq!(msg.workspace_id.as_deref(), Some("ws_demo")); + assert_eq!(msg.workspace_alias.as_deref(), Some("Demo")); + assert_eq!(msg.priority, 1); + assert_eq!(msg.mode, MessageInjectionMode::Steer); + assert_eq!(msg.event_id.as_deref(), Some("evt_auto")); + } + other => panic!("expected immediate drain, got {other:?}"), + } + assert_eq!( + delivery_states + .get(worker_name) + .expect("state should be created") + .pending_snapshot(), + Vec::new(), + "auto_inject drains the per-worker pending queue in the same broker turn" + ); + + cleanup_worker_registry(workers).await; + } + + #[tokio::test] + async fn inbound_queue_manual_flush_holds_until_explicit_drain() { + let worker_name = "worker-a"; + let workers = make_worker_registry_with_worker(worker_name).await; + let mut delivery_states = 
HashMap::from([( + worker_name.to_string(), + InboundDeliveryState::new(InboundDeliveryMode::ManualFlush), + )]); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + worker_name, + inbound_ctx("evt_manual"), + ); + + assert_eq!(outcome, InboundQueueOutcome::Queued); + let snapshot = delivery_states + .get(worker_name) + .expect("manual state should remain present") + .pending_snapshot(); + assert_eq!(snapshot.len(), 1); + assert_eq!(snapshot[0].event_id.as_deref(), Some("evt_manual")); + assert_eq!(snapshot[0].target, "#general"); + + cleanup_worker_registry(workers).await; + } + + #[tokio::test] + async fn inbound_queue_worker_missing_does_not_create_state() { + let (tx, _rx) = mpsc::channel::(16); + let workers = WorkerRegistry::new( + tx, + Vec::new(), + PathBuf::from("/tmp/agent-relay-broker-tests"), + Instant::now(), + ); + let mut delivery_states = HashMap::new(); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + "ghost", + inbound_ctx("evt_missing"), + ); + + assert_eq!(outcome, InboundQueueOutcome::WorkerMissing); + assert!(delivery_states.is_empty()); + } + + fn extract_kind_literals(source: &str) -> BTreeSet { + let marker = "\"kind\""; + let mut kinds = BTreeSet::new(); + let mut cursor = 0; + while let Some(offset) = source[cursor..].find(marker) { + let mut start = cursor + offset + marker.len(); + if start >= source.len() { + break; + } + if !source[start..].starts_with(':') { + cursor = start; + continue; + } + start += 1; + while start < source.len() && source.as_bytes()[start].is_ascii_whitespace() { + start += 1; + } + if start >= source.len() || source.as_bytes()[start] != b'"' { + cursor = start; + continue; + } + start += 1; + if let Some(end) = source[start..].find('"') { + let candidate = &source[start..start + end]; + if !candidate.is_empty() + && candidate + .chars() + .all(|c| c.is_ascii_lowercase() || c == '_' || c.is_ascii_digit()) + { + 
kinds.insert(candidate.to_string()); + } + } + cursor = start; + if cursor >= source.len() { + break; + } + } + kinds + } + + #[test] + fn parses_channels() { + assert_eq!(channels_from_csv("general,ops"), vec!["general", "ops"]); + } + + #[test] + fn channel_normalization() { + assert_eq!(normalize_channel("general"), "#general"); + assert_eq!(normalize_channel("#ops"), "#ops"); + } + + #[test] + fn normalize_initial_task_drops_empty_values() { + assert_eq!(normalize_initial_task(None), None); + assert_eq!(normalize_initial_task(Some(String::new())), None); + assert_eq!(normalize_initial_task(Some(" ".to_string())), None); + } + + #[test] + fn normalize_initial_task_keeps_non_empty_values() { + assert_eq!( + normalize_initial_task(Some("Ship the patch".to_string())), + Some("Ship the patch".to_string()) + ); + } + + #[test] + fn ws_base_derivation() { + assert_eq!( + derive_ws_base_url_from_http("https://api.relaycast.dev"), + "wss://api.relaycast.dev" + ); + assert_eq!( + derive_ws_base_url_from_http("http://localhost:8787"), + "ws://localhost:8787" + ); + } + + #[test] + fn relaycast_control_dedup_key_prefers_event_id() { + let value = json!({ + "type": "agent.spawn_requested", + "event_id": "evt_123", + "agent": { "name": "worker-a", "cli": "claude", "task": "Ship it" } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:evt_123".to_string()) + ); + } + + #[test] + fn relaycast_control_dedup_key_prefers_spawn_token_for_spawn_requests() { + let value = json!({ + "type": "agent.spawn_requested", + "event_id": "evt_123", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it", + "token": "at_live_worker" + } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:at_live_worker".to_string()) + ); + } + + #[test] + fn 
relaycast_control_dedup_key_falls_back_to_agent_name_for_spawn_requests() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:worker-a".to_string()) + ); + } + + #[test] + fn relaycast_control_dedup_key_falls_back_to_serialized_payload() { + let value = json!({ + "type": "agent.release_requested", + "agent": { "name": "worker-a" } + }); + + let key = relaycast_ws_control_dedup_key("ws_1", "agent.release_requested", &value) + .expect("fallback dedup key"); + assert!(key.starts_with("control:ws_1:agent.release_requested:{")); + assert!(key.contains("\"worker-a\"")); + } + + #[test] + fn relaycast_ws_spawn_token_extracts_agent_token() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "token": "at_live_worker" + } + }); + + assert_eq!( + relaycast_ws_spawn_token(&value), + Some("at_live_worker".to_string()) + ); + } + + #[test] + fn relaycast_ws_spawn_name_only_control_key_skips_second_name_dedup() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + let control_key = relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value) + .expect("control dedup key"); + let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); + + assert_eq!(control_key, local_key); + assert!(!relaycast_ws_should_apply_local_spawn_echo_dedup( + Some(control_key.as_str()), + &local_key + )); + } + + #[test] + fn relaycast_ws_spawn_event_id_echo_still_uses_local_name_dedup() { + let value = json!({ + "type": "agent.spawn_requested", + "event_id": "evt_123", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + let control_key = relaycast_ws_control_dedup_key("ws_1", 
"agent.spawn_requested", &value) + .expect("control dedup key"); + let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); + + assert_ne!(control_key, local_key); + assert!(relaycast_ws_should_apply_local_spawn_echo_dedup( + Some(control_key.as_str()), + &local_key + )); + + let now = Instant::now(); + let mut dedup = DedupCache::new(Duration::from_secs(60), 16); + assert!(dedup.insert_if_new(&local_key, now)); + assert!(dedup.insert_if_new(&control_key, now + Duration::from_secs(1))); + assert!(!dedup.insert_if_new(&local_key, now + Duration::from_secs(2))); + } + + #[test] + fn unknown_worker_error_message_matches_release_failures() { + assert!(is_unknown_worker_error_message("unknown worker 'worker-a'")); + assert!(is_unknown_worker_error_message( + "failed to release 'worker-a': unknown worker 'worker-a'" + )); + assert!(!is_unknown_worker_error_message("failed to bind api port")); + } + + #[test] + fn relaycast_self_control_target_matches_aliases_case_insensitively() { + let self_names = HashSet::from([ + "relay-broker".to_string(), + "relay-broker@workspace".to_string(), + ]); + + assert!(is_relaycast_self_control_target( + "Relay-Broker", + "relay-broker", + &self_names + )); + assert!(is_relaycast_self_control_target( + "@relay-broker@workspace", + "relay-broker", + &self_names + )); + assert!(!is_relaycast_self_control_target( + "worker-a", + "relay-broker", + &self_names + )); + } + + #[tokio::test] + async fn contract_health_fixture_requires_rich_listen_health_shape() { + let fixture: Value = serde_json::from_str(include_str!( + "../../../packages/contracts/fixtures/health-fixtures.json" + )) + .expect("health fixture should be valid JSON"); + let expected_shape = fixture + .get("health_response") + .and_then(Value::as_object) + .expect("health fixture must include health_response object"); + + let actual = crate::listen_api::listen_api_health_payload(None, vec![]); + + for required_key in expected_shape.keys() { + // 
TODO(contract-wave1-health-shape): listen-mode /health should + // implement the shared BrokerHealthResponse contract fields. + assert!( + actual.get(required_key).is_some(), + "listen /health response is missing required contract field: {}", + required_key + ); + } + } + + #[tokio::test] + async fn contract_startup_429_fixture_requires_degraded_health_status() { + let fixture: Value = serde_json::from_str(include_str!( + "../../../packages/contracts/fixtures/health-fixtures.json" + )) + .expect("health fixture should be valid JSON"); + let expected = fixture + .get("wave0_startup_429_degraded") + .and_then(|v| v.get("expected_health_status")) + .and_then(Value::as_str) + .expect("health fixture must include expected degraded health status"); + let startup_error_code = fixture + .get("wave0_startup_429_degraded") + .and_then(|v| v.get("error")) + .and_then(|v| v.get("code")) + .and_then(Value::as_str) + .expect("health fixture must include startup error code"); + std::env::set_var("AGENT_RELAY_STARTUP_ERROR_CODE", startup_error_code); + let actual = crate::listen_api::listen_api_health_payload(None, vec![]) + .get("status") + .and_then(Value::as_str) + .unwrap_or("unknown") + .to_string(); + std::env::remove_var("AGENT_RELAY_STARTUP_ERROR_CODE"); + + assert_eq!( + actual, expected, + "listen /health status \"{}\" does not match startup 429 degraded contract \"{}\"", + actual, expected + ); + } + + #[test] + fn contract_replay_fixture_requires_replay_route_exposure() { + let replay_fixture: Value = serde_json::from_str(include_str!( + "../../../packages/contracts/fixtures/replay-fixtures.json" + )) + .expect("replay fixture should be valid JSON"); + assert!( + replay_fixture.get("replay_cursor_request").is_some(), + "replay fixture must include replay_cursor_request" + ); + assert!( + replay_fixture.get("replay_response").is_some(), + "replay fixture must include replay_response" + ); + + let source = include_str!("listen_api.rs"); + assert!( + 
source.contains(".route(\"/api/events/replay\""), + "listen API router does not expose /api/events/replay" + ); + } + + #[test] + fn contract_timeout_fixture_requires_terminal_failed_guard_before_late_ack() { + let replay_fixture: Value = serde_json::from_str(include_str!( + "../../../packages/contracts/fixtures/replay-fixtures.json" + )) + .expect("replay fixture should be valid JSON"); + let timeout_fixture = replay_fixture + .get("wave0_timeout_terminal_semantics") + .and_then(Value::as_object) + .expect("replay fixture must include wave0_timeout_terminal_semantics object"); + + let expected_terminal_status = timeout_fixture + .get("expected_terminal_status") + .and_then(Value::as_str) + .expect("timeout fixture requires expected_terminal_status"); + let late_event_kind = timeout_fixture + .get("late_event_kind") + .and_then(Value::as_str) + .expect("timeout fixture requires late_event_kind"); + + let source = include_str!("runtime.rs"); + let ack_branch = source + .find("msg_type == \"delivery_ack\"") + .map(|idx| { + let end = (idx + 1200).min(source.len()); + &source[idx..end] + }) + .expect("runtime.rs must include delivery_ack handling"); + + assert!( + ack_branch.contains(expected_terminal_status) || ack_branch.contains("terminal"), + "delivery_ack branch lacks terminal guard for timeout status \"{}\" and late event \"{}\"", + expected_terminal_status, + late_event_kind + ); + } + + #[test] + fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() { + let event_fixture: Value = serde_json::from_str(include_str!( + "../../../packages/contracts/fixtures/event-fixtures.json" + )) + .expect("event fixture should be valid JSON"); + let required = event_fixture + .get("wave0_broadcast_whitelist") + .and_then(|v| v.get("required_kinds")) + .and_then(Value::as_array) + .expect("event fixture must include wave0_broadcast_whitelist.required_kinds") + .iter() + .filter_map(Value::as_str) + .map(str::to_owned) + .collect::<HashSet<String>>(); + + let emitted = 
extract_kind_literals(include_str!("runtime.rs")); + + assert!( + required.is_subset(&emitted), + "broker source is missing required broadcast kinds; expected {:?}, got {:?}", + required, + emitted + ); + } + + #[test] + fn build_thread_infos_groups_channel_messages() { + let messages = vec![ + json!({ + "from": "broker", + "target": "#general", + "text": "outbound", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "Lead", + "target": "#general", + "text": "inbound", + "timestamp": "2026-02-23T10:01:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!(threads[0].thread_id, "#general"); + assert_eq!(threads[0].name, "#general"); + assert_eq!(threads[0].unread_count, 1); + assert_eq!(threads[0].last_message.as_deref(), Some("inbound")); + } + + #[test] + fn build_thread_infos_groups_direct_messages_case_insensitively() { + let messages = vec![ + json!({ + "from": "BROKER", + "to": "WorkerA", + "text": "ping", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "workera", + "to": "broker", + "text": "pong", + "timestamp": "2026-02-23T10:01:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!(threads[0].thread_id, "direct:broker:workera"); + assert_eq!(threads[0].name, "workera"); + assert_eq!(threads[0].unread_count, 1); + assert_eq!(threads[0].last_message.as_deref(), Some("pong")); + } + + #[test] + fn build_thread_infos_uses_dm_conversation_id_and_sender_name() { + let messages = vec![json!({ + "from": "Planner", + "conversation_id": "conv_123", + "text": "dm payload", + "timestamp": "2026-02-23T10:01:00Z", + })]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + 
assert_eq!(threads[0].thread_id, "conv_123"); + assert_eq!(threads[0].name, "Planner"); + assert_eq!(threads[0].unread_count, 1); + } + + #[test] + fn build_thread_infos_shows_dms_between_non_broker_agents() { + let messages = vec![ + json!({ + "from": "WorkerA", + "conversation_id": "dm_456", + "participants": ["WorkerA", "WorkerB"], + "text": "hello WorkerB", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "WorkerB", + "conversation_id": "dm_456", + "participants": ["WorkerA", "WorkerB"], + "text": "hi WorkerA", + "timestamp": "2026-02-23T10:01:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1, "should group into one conversation"); + assert_eq!(threads[0].thread_id, "dm_456"); + assert_eq!(threads[0].name, "WorkerA ↔ WorkerB"); + assert_eq!( + threads[0].unread_count, 2, + "both messages unread (neither from broker)" + ); + assert_eq!(threads[0].last_message.as_deref(), Some("hi WorkerA")); + } + + #[test] + fn build_thread_infos_dm_with_participants_filters_broker() { + let messages = vec![json!({ + "from": "WorkerA", + "conversation_id": "dm_789", + "participants": ["broker", "WorkerA"], + "text": "hello broker", + "timestamp": "2026-02-23T10:00:00Z", + })]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!( + threads[0].name, "WorkerA", + "should filter out broker from participants" + ); + } + + #[test] + fn build_thread_infos_multiple_independent_dm_conversations() { + let messages = vec![ + json!({ + "from": "Alice", + "conversation_id": "dm_aaa", + "participants": ["Alice", "Bob"], + "text": "hi Bob", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "Charlie", + "conversation_id": "dm_bbb", + "participants": ["Charlie", "Diana"], + "text": "hi Diana", + "timestamp": "2026-02-23T10:01:00Z", + }), 
+ json!({ + "from": "broker", + "conversation_id": "dm_ccc", + "participants": ["broker", "Eve"], + "text": "hi Eve", + "timestamp": "2026-02-23T10:02:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!( + threads.len(), + 3, + "should have three separate DM conversations" + ); + + let thread_aaa = threads.iter().find(|t| t.thread_id == "dm_aaa").unwrap(); + assert_eq!(thread_aaa.name, "Alice ↔ Bob"); + + let thread_bbb = threads.iter().find(|t| t.thread_id == "dm_bbb").unwrap(); + assert_eq!(thread_bbb.name, "Charlie ↔ Diana"); + + let thread_ccc = threads.iter().find(|t| t.thread_id == "dm_ccc").unwrap(); + assert_eq!(thread_ccc.name, "Eve", "broker filtered from participants"); + } + + #[test] + fn build_thread_infos_respects_explicit_unread_count() { + let messages = vec![json!({ + "from": "Planner", + "target": "broker", + "text": "status", + "unread_count": 7, + "timestamp": "2026-02-23T10:01:00Z", + })]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!(threads[0].unread_count, 7); + } + + #[test] + fn build_agent_state_transition_event_has_expected_shape() { + let payload = build_agent_state_transition_event("worker-a", "spawned", Some("sdk_spawn")); + assert_eq!(payload["type"], "agent.state"); + assert_eq!(payload["state"], "spawned"); + assert_eq!(payload["agent"]["name"], "worker-a"); + assert_eq!(payload["reason"], "sdk_spawn"); + assert!(payload["timestamp"].as_str().is_some()); + + let no_reason = build_agent_state_transition_event("worker-a", "idle", None); + assert!(no_reason.get("reason").is_none()); + } + + #[test] + fn preregistration_error_message_dedupes_retry_after_for_rate_limit() { + let error = RelaycastRegistrationError::RateLimited { + agent_name: "Foobar".to_string(), + retry_after_secs: 60, + detail: 
"{\"ok\":false}".to_string(), + }; + let message = format_worker_preregistration_error("Foobar", &error); + assert_eq!(message.matches("retry after").count(), 1); + } + + #[test] + fn preregistration_error_message_does_not_invent_retry_after_for_transport_errors() { + let error = RelaycastRegistrationError::Transport { + agent_name: "Foobar".to_string(), + detail: "timeout".to_string(), + }; + let message = format_worker_preregistration_error("Foobar", &error); + assert!(!message.contains("retry after")); + } + + #[test] + fn injection_format_preserved() { + let rendered = format_injection("alice", "evt_1", "hello", "bob"); + assert!(rendered.contains("")); + assert!(rendered.contains("mcp__relaycast__message_dm_send")); + assert!(rendered.contains("Relay message from alice [evt_1]: hello")); + } + + #[test] + fn injection_format_includes_channel() { + let rendered = format_injection("alice", "evt_1", "hello", "#general"); + assert!(rendered.contains("mcp__relaycast__message_post")); + assert!(rendered.contains("channel: \"general\"")); + assert!(rendered.contains("Relay message from alice in #general [evt_1]: hello")); + } + + #[test] + fn normalize_sender_defaults_to_human_orchestrator() { + assert_eq!(normalize_sender(None), "human:orchestrator"); + assert_eq!(normalize_sender(Some(String::new())), "human:orchestrator"); + assert_eq!( + normalize_sender(Some(" ".to_string())), + "human:orchestrator" + ); + } + + #[test] + fn normalize_sender_normalizes_human_prefix() { + assert_eq!( + normalize_sender(Some("human: Dashboard ".to_string())), + "human:Dashboard" + ); + } + + #[test] + fn normalize_sender_preserves_worker_names() { + assert_eq!( + normalize_sender(Some("WorkerOne".to_string())), + "WorkerOne".to_string() + ); + } + + #[test] + fn sender_is_dashboard_label_accepts_legacy_dashboard_senders() { + assert!(sender_is_dashboard_label("Dashboard", "my-project")); + assert!(sender_is_dashboard_label("human:Dashboard", "my-project")); + 
assert!(sender_is_dashboard_label( + "human:orchestrator", + "my-project" + )); + assert!(sender_is_dashboard_label("my-project", "my-project")); + assert!(!sender_is_dashboard_label("Lead", "my-project")); + } + + #[test] + fn display_target_for_dashboard_maps_self_identity() { + let mut self_names = HashSet::new(); + self_names.insert("broker-951762d5".to_string()); + self_names.insert("DashProbe".to_string()); + let primary = "my-project"; + + assert_eq!( + display_target_for_dashboard("broker-951762d5", &self_names, primary), + "my-project" + ); + assert_eq!( + display_target_for_dashboard("dashprobe", &self_names, primary), + "my-project" + ); + assert_eq!( + display_target_for_dashboard("Lead", &self_names, primary), + "Lead".to_string() + ); + } + + #[test] + fn delivery_retry_interval_uses_default_and_env_override() { + std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); + assert_eq!(delivery_retry_interval().as_millis(), 1_000); + + std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "250"); + assert_eq!(delivery_retry_interval().as_millis(), 250); + + std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "1"); + assert_eq!(delivery_retry_interval().as_millis(), 50); + + std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); + } + + #[test] + fn http_api_timeout_windows_use_default_and_env_override() { + std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); + + assert_eq!(http_api_local_delivery_timeout().as_millis(), 3_000); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 20_000); + assert_eq!(http_api_event_emit_timeout().as_millis(), 200); + + std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "10"); + std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "100"); + std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "1"); + + 
assert_eq!(http_api_local_delivery_timeout().as_millis(), 100); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 500); + assert_eq!(http_api_event_emit_timeout().as_millis(), 25); + + std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "1500"); + std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "12000"); + std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "150"); + + assert_eq!(http_api_local_delivery_timeout().as_millis(), 1_500); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 12_000); + assert_eq!(http_api_event_emit_timeout().as_millis(), 150); + + std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); + } + + #[test] + fn drop_pending_for_worker_removes_only_matching_entries() { + let mut pending = HashMap::new(); + pending.insert( + "del_1".to_string(), + PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }, + ); + pending.insert( + "del_2".to_string(), + PendingDelivery { + worker_name: "B".to_string(), + delivery: RelayDelivery { + delivery_id: "del_2".to_string(), + event_id: "evt_2".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "y".to_string(), + target: "#general".to_string(), + body: "world".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }, + ); + + let 
dropped = drop_pending_for_worker(&mut pending, "A"); + assert_eq!(dropped, 1); + assert!(pending.contains_key("del_2")); + assert!(!pending.contains_key("del_1")); + } + + #[test] + fn should_clear_pending_delivery_when_event_id_matches() { + let pending = PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }; + + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + Some("evt_1") + )); + assert!(!should_clear_pending_delivery_for_event( + Some(&pending), + Some("evt_2") + )); + } + + #[test] + fn should_clear_pending_delivery_without_event_id_for_compatibility() { + let pending = PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }; + + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + None + )); + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + Some("") + )); + assert!(should_clear_pending_delivery_for_event(None, Some("evt_1"))); + } + + // ==================== strip_ansi tests ==================== + + #[test] + fn strip_ansi_removes_csi_sequences() { + assert_eq!(strip_ansi("\x1b[32mHello\x1b[0m"), "Hello"); + assert_eq!(strip_ansi("\x1b[1;31mred bold\x1b[0m"), "red bold"); + } + + #[test] + fn 
strip_ansi_removes_osc_sequences() { + assert_eq!(strip_ansi("\x1b]0;title\x07rest"), "rest"); + assert_eq!(strip_ansi("\x1b]0;title\x1b\\rest"), "rest"); + } + + #[test] + fn strip_ansi_preserves_plain_text() { + assert_eq!(strip_ansi("Hello world"), "Hello world"); + assert_eq!(strip_ansi(""), ""); + } + + #[test] + fn strip_ansi_handles_mixed_content() { + let input = "\x1b[33m⚠️ bypass\x1b[0m permissions mode\n\x1b[1m(yes/no)\x1b[0m"; + let clean = strip_ansi(input); + assert!(clean.contains("bypass")); + assert!(clean.contains("(yes/no)")); + assert!(!clean.contains("\x1b")); + } + + #[test] + fn strip_ansi_handles_cursor_forward_sequences() { + // Claude Code uses \x1b[1C (cursor forward) instead of spaces + // These should be replaced with spaces so echo detection works + let input = "\x1b[1CYes,\x1b[1CI\x1b[1Caccept"; + let clean = strip_ansi(input); + assert_eq!(clean, " Yes, I accept"); + } + + // ==================== floor_char_boundary tests ==================== + + #[test] + fn floor_char_boundary_at_valid_positions() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 0), 0); + assert_eq!(floor_char_boundary(s, 6), 6); + assert_eq!(floor_char_boundary(s, 9), 9); + } + + #[test] + fn floor_char_boundary_mid_multibyte() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 7), 6); + assert_eq!(floor_char_boundary(s, 8), 6); + } + + #[test] + fn floor_char_boundary_past_end() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 100), s.len()); + } + + // ==================== detect_bypass_permissions_prompt tests ==================== + + #[test] + fn bypass_perms_yes_no_prompt() { + let output = "⚠️ Bypassing all permission checks.\nDo you want to proceed? (yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); + } + + #[test] + fn bypass_perms_dangerously_with_yn() { + let output = "Running with --dangerously-skip-permissions\nAccept the risks? 
(y/n)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); + } + + #[test] + fn bypass_perms_accept_risk_variant() { + let output = + "bypass permissions mode enabled\nDo you accept the risk of running in this mode?"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); + } + + #[test] + fn bypass_perms_no_match_normal_output() { + let output = "I'll help you fix that bug. Let me read the file first."; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(!has_ref); + assert!(!has_confirm); + } + + #[test] + fn bypass_perms_no_false_positive_permission_without_bypass() { + let output = "File permission denied. (yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(!has_ref, "permission without bypass should not match"); + assert!(has_confirm, "yes/no detected but insufficient alone"); + } + + #[test] + fn bypass_perms_no_false_positive_status_bar() { + let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref, "status bar has bypass+permissions"); + assert!(!has_confirm, "but no confirmation prompt"); + } + + #[test] + fn bypass_perms_selection_menu_format() { + let output = "WARNING: ClaudeCoderunninginBypassPermissionsmode\n\ + Byproceeding,youacceptallresponsibility\n\ + No,exit\nYes,Iaccept\nEntertoconfirm"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); + assert!(is_bypass_selection_menu(output)); + } + + #[test] + fn bypass_perms_selection_menu_with_spaces() { + let output = "WARNING: Claude Code running in Bypass Permissions mode\n\ + 1. No, exit\n2. 
Yes, I accept\nEnter to confirm"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref && has_confirm); + assert!(is_bypass_selection_menu(output)); + } + + #[test] + fn bypass_perms_legacy_not_selection_menu() { + let output = "bypass permissions mode\nProceed? (yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref && has_confirm, "legacy should still detect"); + assert!( + !is_bypass_selection_menu(output), + "legacy should NOT be selection menu" + ); + } + + #[test] + fn bypass_perms_with_raw_ansi() { + let raw = "\x1b[33m⚠️ bypass permissions\x1b[0m mode\nProceed? \x1b[1m(yes/no)\x1b[0m"; + let clean = strip_ansi(raw); + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(&clean); + assert!(has_ref && has_confirm); + } + + // ==================== detect_claude_trust_prompt tests ==================== + + #[test] + fn claude_trust_prompt_full_match() { + let output = "take a moment to review what's in this folder first.\n\ + Claude Code'll be able to read, edit, and execute files here.\n\ + Security guide\n\ + ❯ 1. Yes, I trust this folder\n\ + 2. No, exit\n\ + Enter to confirm · Esc to cancel"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(has_confirmation); + } + + #[test] + fn claude_trust_prompt_stripped_spaces() { + let output = "Yes,Itrustthisfolder\nNo,exit"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(has_confirmation); + } + + #[test] + fn claude_trust_prompt_no_match_normal_output() { + let output = "I'll help you fix that bug. 
Let me read the file first."; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(!has_trust_ref); + assert!(!has_confirmation); + } + + #[test] + fn claude_trust_prompt_partial_no_exit() { + let output = "Yes, I trust this folder"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(!has_confirmation, "should not match without exit option"); + } + + #[test] + fn claude_trust_prompt_with_ansi() { + let raw = "\x1b[1m❯ 1. Yes, I trust this folder\x1b[0m\n 2. No, exit"; + let clean = strip_ansi(raw); + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(&clean); + assert!(has_trust_ref && has_confirmation); + } + + // ==================== is_in_editor_mode tests ==================== + + #[test] + fn editor_mode_vim_insert() { + assert!(is_in_editor_mode("Some text\n-- INSERT --\n")); + assert!(is_in_editor_mode("Some text\n-- INSERT --")); + } + + #[test] + fn editor_mode_claude_cli_not_vim() { + let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; + assert!(!is_in_editor_mode(output)); + } + + #[test] + fn editor_mode_nano() { + let output = " GNU nano 5.8\nFile: test.txt\n^G Get Help ^O Write Out"; + assert!(is_in_editor_mode(output)); + } + + #[test] + fn editor_mode_less_pager() { + assert!(is_in_editor_mode("some content\n(END)")); + assert!(is_in_editor_mode("some content\n--More--")); + } + + #[test] + fn editor_mode_normal_output() { + assert!(!is_in_editor_mode( + "I'll help you with that task. Let me search." 
+ )); + assert!(!is_in_editor_mode("$ ls -la\ntotal 0\n$ ")); + } + + #[test] + fn editor_mode_with_ansi() { + let output = "\x1b[32mSome text\x1b[0m\n-- INSERT --\n"; + assert!(is_in_editor_mode(output)); + } + + #[test] + fn editor_mode_vim_visual_modes() { + assert!(is_in_editor_mode("text\n-- VISUAL --\n")); + assert!(is_in_editor_mode("text\n-- VISUAL LINE --\n")); + assert!(is_in_editor_mode("text\n-- VISUAL BLOCK --\n")); + assert!(is_in_editor_mode("text\n-- REPLACE --\n")); + } + + #[test] + fn editor_mode_claude_normal_not_vim() { + assert!(!is_in_editor_mode("-- NORMAL -- ► some Claude UI text")); + assert!(!is_in_editor_mode("-- VISUAL -- ▶ Claude UI")); + } + + #[test] + fn auto_suggestion_detects_cursor_plus_dim_pattern() { + assert!(is_auto_suggestion( + "\x1b[7mW\x1b[27m\x1b[2mhat's the task?\x1b[22m" + )); + } + + #[test] + fn auto_suggestion_detects_send_hint() { + assert!(is_auto_suggestion(" ↵ send")); + } + + #[test] + fn auto_suggestion_ignores_normal_output() { + assert!(!is_auto_suggestion("Relay message from Alice [abc]: hello")); + assert!(!is_auto_suggestion("Running tests...")); + assert!(!is_auto_suggestion("> \x1b[7m \x1b[27m")); + } + + #[test] + fn extract_mcp_ids_from_tool_response() { + let output = r#" ⎿ { + "id": "147310274064424960", + "conversation_id": "147310245874507776", + "from": "agent-a", + "text": "hello" + }"#; + let ids = extract_mcp_message_ids(output); + // Only extracts "id" keys, not "conversation_id" + assert_eq!(ids, vec!["147310274064424960"]); + } + + #[test] + fn extract_mcp_ids_ignores_short_ids() { + let output = r#""id": "123""#; + assert!(extract_mcp_message_ids(output).is_empty()); + } + + #[test] + fn extract_mcp_ids_ignores_non_numeric() { + let output = r#""id": "msg_abc123def456ghi""#; + assert!(extract_mcp_message_ids(output).is_empty()); + } + + #[test] + fn extract_mcp_ids_handles_no_ids() { + assert!(extract_mcp_message_ids("normal output with no JSON").is_empty()); + 
assert!(extract_mcp_message_ids("").is_empty()); + } + + // ==================== bypass flag selection logic tests ==================== + // Tests for the bypass flag logic used in WorkerRegistry::spawn(). + // The logic is: claude/claude:* → --dangerously-skip-permissions, codex → --dangerously-bypass-approvals-and-sandbox, gemini → --yolo + + fn compute_bypass_flag(cli: &str, existing_args: &[String]) -> Option<&'static str> { + let cli_lower = cli.to_lowercase(); + if (cli_lower == "claude" || cli_lower.starts_with("claude:")) + && !existing_args + .iter() + .any(|a| a.contains("dangerously-skip-permissions")) + { + Some("--dangerously-skip-permissions") + } else if cli_lower == "codex" + && !existing_args + .iter() + .any(|a| a.contains("dangerously-bypass") || a.contains("full-auto")) + { + Some("--dangerously-bypass-approvals-and-sandbox") + } else if cli_lower == "gemini" && !existing_args.iter().any(|a| a == "--yolo" || a == "-y") + { + Some("--yolo") + } else { + None + } + } + + #[test] + fn bypass_flag_claude_gets_skip_permissions() { + assert_eq!( + compute_bypass_flag("claude", &[]), + Some("--dangerously-skip-permissions") + ); + } + + #[test] + fn bypass_flag_claude_variant_gets_skip_permissions() { + assert_eq!( + compute_bypass_flag("claude:latest", &[]), + Some("--dangerously-skip-permissions") + ); + assert_eq!( + compute_bypass_flag("Claude", &[]), + Some("--dangerously-skip-permissions") + ); + assert_eq!( + compute_bypass_flag("CLAUDE:v2", &[]), + Some("--dangerously-skip-permissions") + ); + } + + #[test] + fn bypass_flag_codex_gets_dangerously_bypass() { + assert_eq!( + compute_bypass_flag("codex", &[]), + Some("--dangerously-bypass-approvals-and-sandbox") + ); + } + + #[test] + fn bypass_flag_gemini_gets_yolo() { + assert_eq!(compute_bypass_flag("gemini", &[]), Some("--yolo")); + } + + #[test] + fn bypass_flag_gemini_dedup_when_yolo_present() { + let args = vec!["--yolo".to_string()]; + assert_eq!( + compute_bypass_flag("gemini", &args), + None, + "should 
not duplicate --yolo flag" + ); + } + + #[test] + fn bypass_flag_gemini_dedup_when_y_present() { + let args = vec!["-y".to_string()]; + assert_eq!( + compute_bypass_flag("gemini", &args), + None, + "should not duplicate when -y shorthand present" + ); + } + + #[test] + fn bypass_flag_aider_gets_none() { + assert_eq!(compute_bypass_flag("aider", &[]), None); + } + + #[test] + fn bypass_flag_goose_gets_none() { + assert_eq!(compute_bypass_flag("goose", &[]), None); + } + + #[test] + fn bypass_flag_unknown_cli_gets_none() { + assert_eq!(compute_bypass_flag("mystery-cli", &[]), None); + } + + #[test] + fn bypass_flag_claude_dedup_when_already_present() { + let args = vec!["--dangerously-skip-permissions".to_string()]; + assert_eq!( + compute_bypass_flag("claude", &args), + None, + "should not duplicate flag" + ); + } + + #[test] + fn bypass_flag_codex_dedup_when_already_present() { + let args = vec!["--dangerously-bypass-approvals-and-sandbox".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + None, + "should not duplicate flag" + ); + } + + #[test] + fn bypass_flag_codex_dedup_when_full_auto_present() { + let args = vec!["--full-auto".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + None, + "should not add bypass when --full-auto already present" + ); + } + + #[test] + fn bypass_flag_claude_dedup_partial_match() { + // If someone passes a different arg containing the substring, still dedup + let args = vec!["--my-dangerously-skip-permissions-flag".to_string()]; + assert_eq!( + compute_bypass_flag("claude", &args), + None, + "substring match should prevent duplication" + ); + } + + #[test] + fn bypass_flag_codex_with_other_args() { + let args = vec!["--model".to_string(), "gpt-4".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + Some("--dangerously-bypass-approvals-and-sandbox"), + "unrelated args should not prevent bypass flag" + ); + } + + // ==================== is_pid_alive ==================== + + #[test] + 
fn is_pid_alive_returns_true_for_self() { + let pid = std::process::id(); + assert!( + crate::broker::is_pid_alive(pid), + "current process PID should be alive" + ); + } + + #[test] + fn is_pid_alive_returns_false_for_dead_pid() { + // Spawn a short-lived child, wait for it to exit, then verify it's dead + let child = std::process::Command::new("true") + .spawn() + .expect("failed to spawn 'true'"); + let pid = child.id(); + let mut child = child; + child.wait().expect("failed to wait on child"); + // After the child exits, its PID should not be alive + // (the PID may be recycled, but on macOS/Linux it won't be immediately) + assert!( + !crate::broker::is_pid_alive(pid), + "exited child PID should be dead" + ); + } + + #[test] + fn is_pid_alive_returns_false_for_bogus_pid() { + // PID 0 is the kernel scheduler — kill(0, 0) signals the entire process group, + // not a real target. Use a very high PID that almost certainly doesn't exist. + // On macOS pid_max is ~99999; on Linux it's typically 32768 or 4194304. + // 4_000_000 is unlikely to be in use. + assert!( + !crate::broker::is_pid_alive(4_000_000), + "bogus PID 4_000_000 should not be alive (ESRCH)" + ); + } + + #[test] + fn is_pid_alive_eperm_means_alive() { + // PID 1 (launchd/init) is owned by root. When run as a normal user, + // kill(1, 0) returns EPERM — the process exists but we can't signal it. + // This is exactly the EPERM case our fix handles. + // Skip if running as root (e.g., in some CI containers) since root can + // signal any process and would get rc=0 instead of EPERM. 
+ if unsafe { nix::libc::getuid() } == 0 { + eprintln!("skipping EPERM test: running as root"); + return; + } + assert!( + crate::broker::is_pid_alive(1), + "PID 1 (init/launchd) should report alive via EPERM" + ); + } + + // ==================== write_pid_file ==================== + + // ==================== continuity_dir ==================== + + #[test] + fn continuity_dir_derives_correct_path_from_state_json() { + let state_path = std::path::Path::new("/project/.agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!( + result, + std::path::PathBuf::from("/project/.agent-relay/continuity") + ); + } + + #[test] + fn continuity_dir_works_with_nested_project_path() { + let state_path = std::path::Path::new("/home/user/projects/my-app/.agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!( + result, + std::path::PathBuf::from("/home/user/projects/my-app/.agent-relay/continuity") + ); + } + + #[test] + fn continuity_dir_preserves_relative_paths() { + let state_path = std::path::Path::new(".agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!(result, std::path::PathBuf::from(".agent-relay/continuity")); + } + + #[test] + fn http_api_spawn_spec_defaults_to_pty_runtime() { + let spec = build_http_api_spawn_spec( + "worker-a".to_string(), + "codex".to_string(), + None, + Some("o3".to_string()), + vec!["--fast".to_string()], + vec!["general".to_string()], + Some("/tmp/project".to_string()), + Some("core".to_string()), + Some("Lead".to_string()), + Some("subagent".to_string()), + None, + ) + .expect("spec should build"); + + assert!(matches!(spec.runtime, AgentRuntime::Pty)); + assert!(spec.provider.is_none()); + assert_eq!(spec.cli.as_deref(), Some("codex")); + assert_eq!(spec.model.as_deref(), Some("o3")); + } + + #[test] + fn http_api_spawn_spec_uses_headless_runtime_for_supported_providers() { + let spec = build_http_api_spawn_spec( + "worker-a".to_string(), + "opencode".to_string(), + 
Some("headless".to_string()), + Some("ignored".to_string()), + vec![], + vec!["general".to_string()], + None, + None, + None, + None, + None, + ) + .expect("headless spec should build"); + + assert!(matches!(spec.runtime, AgentRuntime::Headless)); + assert!(matches!( + spec.provider, + Some(ProtocolHeadlessProvider::Opencode) + )); + assert!(spec.cli.is_none()); + assert_eq!(spec.model.as_deref(), Some("ignored")); + } + + #[test] + fn headless_provider_command_claude_places_flags_before_task() { + let (bin, args) = super::headless_provider_command( + &ProtocolHeadlessProvider::Claude, + "hello world", + &[ + "--mcp-config".to_string(), + "{\"mcpServers\":{}}".to_string(), + ], + ); + + assert_eq!(bin, "claude"); + assert_eq!(args.last().map(String::as_str), Some("hello world")); + let mcp_pos = args.iter().position(|a| a == "--mcp-config").unwrap(); + let task_pos = args.iter().position(|a| a == "hello world").unwrap(); + assert!(mcp_pos < task_pos, "--mcp-config must precede task"); + } + + #[test] + fn headless_provider_command_opencode_places_flags_before_task() { + let (bin, args) = super::headless_provider_command( + &ProtocolHeadlessProvider::Opencode, + "hello world", + &["--agent".to_string(), "relaycast".to_string()], + ); + + assert_eq!(bin, "opencode"); + assert_eq!(args.first().map(String::as_str), Some("run")); + assert_eq!(args.last().map(String::as_str), Some("hello world")); + let agent_pos = args.iter().position(|a| a == "--agent").unwrap(); + let task_pos = args.iter().position(|a| a == "hello world").unwrap(); + assert!(agent_pos < task_pos, "--agent must precede task"); + } + + #[test] + fn http_api_spawn_spec_rejects_unknown_headless_providers() { + let error = build_http_api_spawn_spec( + "worker-a".to_string(), + "codex".to_string(), + Some("headless".to_string()), + None, + vec![], + vec!["general".to_string()], + None, + None, + None, + None, + None, + ) + .expect_err("unsupported headless provider should fail"); + + assert!( + error + 
.to_string() + .contains("does not support headless transport"), + "unexpected error: {error}" + ); + } + + // ==================== model flag injection tests ==================== + // Tests for the --model flag injection logic used in WorkerRegistry::spawn(). + // When spec.model is set and non-empty, the broker should inject --model + // into the spawned CLI's argv, unless the user already specified --model. + + /// Mirror of the model flag logic in WorkerRegistry::spawn(). + fn compute_model_flag(model: Option<&str>, existing_args: &[String]) -> Option { + model.and_then(|m| { + if m.is_empty() + || existing_args + .iter() + .any(|a| a == "--model" || a.starts_with("--model=") || a == "-m") + { + None + } else { + Some(m.to_string()) + } + }) + } + + #[test] + fn model_flag_injected_when_present() { + assert_eq!( + compute_model_flag(Some("haiku"), &[]), + Some("haiku".to_string()), + "model should be injected when set and args are empty" + ); + } + + #[test] + fn model_flag_not_injected_when_none() { + assert_eq!( + compute_model_flag(None, &[]), + None, + "model should not be injected when not set" + ); + } + + #[test] + fn model_flag_not_injected_when_empty() { + assert_eq!( + compute_model_flag(Some(""), &[]), + None, + "model should not be injected when empty string" + ); + } + + #[test] + fn model_flag_not_injected_when_already_in_args() { + let args = vec!["--model".to_string(), "opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), &args), + None, + "model should not be injected when --model already in args" + ); + } + + #[test] + fn model_flag_not_injected_when_short_flag_in_args() { + let args = vec!["-m".to_string(), "opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), &args), + None, + "model should not be injected when -m already in args" + ); + } + + #[test] + fn model_flag_not_injected_when_equals_format_in_args() { + let args = vec!["--model=opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), 
&args), + None, + "model should not be injected when --model=value already in args" + ); + } + + #[test] + fn model_flag_injected_with_other_args() { + let args = vec!["--verbose".to_string()]; + assert_eq!( + compute_model_flag(Some("gpt-4o"), &args), + Some("gpt-4o".to_string()), + "model should be injected when other unrelated args exist" + ); + } +} diff --git a/crates/broker/src/worker.rs b/crates/broker/src/worker.rs index ead181b5a..78b70f827 100644 --- a/crates/broker/src/worker.rs +++ b/crates/broker/src/worker.rs @@ -21,9 +21,9 @@ use tokio::{ }; use crate::{ - headless_provider_cli_name, helpers::{normalize_cli_name, parse_cli_command}, routing, + runtime::headless_provider_cli_name, spawner::terminate_child, }; diff --git a/crates/broker/src/wrap.rs b/crates/broker/src/wrap.rs index 2bd401405..a4ec17c41 100644 --- a/crates/broker/src/wrap.rs +++ b/crates/broker/src/wrap.rs @@ -1,14 +1,34 @@ -use std::collections::VecDeque; +use std::collections::{HashMap, VecDeque}; use std::time::{Duration, Instant}; -use super::*; +use anyhow::{Context, Result}; +use relay_broker::{ + control::{can_release_child, is_human_sender}, + dedup::DedupCache, + message_bridge::{map_ws_broker_command, map_ws_event}, + pty::PtySession, + relaycast_ws::{retry_agent_registration, RegRetryOutcome, WsControl}, + telemetry::{ActionSource, TelemetryClient, TelemetryEvent}, + types::{BrokerCommandPayload, InboundKind, SenderKind}, +}; +use tokio::{sync::mpsc, time::MissedTickBehavior}; + use crate::helpers::{ - agent_name_eq, check_echo_in_output, floor_char_boundary, - format_injection_for_worker_with_workspace, is_self_name, resolve_dm_participants_cached, - ActivityDetector, DeliveryOutcome, PendingActivity, PendingVerification, ThrottleState, - ACTIVITY_BUFFER_KEEP_BYTES, ACTIVITY_BUFFER_MAX_BYTES, ACTIVITY_WINDOW, - MAX_VERIFICATION_ATTEMPTS, VERIFICATION_WINDOW, + agent_name_eq, check_echo_in_output, detect_bypass_permissions_prompt, + detect_claude_trust_prompt, 
detect_codex_model_prompt, detect_gemini_action_required, + detect_gemini_trust_prompt, detect_gemini_untrusted_banner, detect_opencode_permission_prompt, + floor_char_boundary, format_injection_for_worker_with_workspace, is_auto_suggestion, + is_bypass_selection_menu, is_in_editor_mode, is_self_name, parse_cli_command, + resolve_dm_participants_cached, strip_ansi, ActivityDetector, DeliveryOutcome, PendingActivity, + PendingVerification, ThrottleState, ACTIVITY_BUFFER_KEEP_BYTES, ACTIVITY_BUFFER_MAX_BYTES, + ACTIVITY_WINDOW, MAX_VERIFICATION_ATTEMPTS, VERIFICATION_WINDOW, +}; +use crate::runtime::{ + channels_from_csv, command_targets_self, connect_relay, ensure_runtime_paths, env_flag_enabled, + extract_mcp_message_ids, get_terminal_size, terminal_cols, terminal_rows, RelaySession, + RelaySessionOptions, RelayWorkspace, }; +use crate::spawner::{spawn_env_vars, Spawner}; // PTY auto-response constants (shared by wrap and pty workers) const BYPASS_PERMS_COOLDOWN: Duration = Duration::from_secs(2); From cb59df6a462436f7edde9dde6338b532cfc4a8d0 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 21:09:33 -0400 Subject: [PATCH 2/8] chore: update issue 875 trajectory metadata --- .../completed/2026-05/traj_f9wxa8ujeg78.json | 17 +++++++++++++---- 1 file changed, 13 insertions(+), 4 deletions(-) diff --git a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json index edd1dc7cc..8de23c4b4 100644 --- a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json +++ b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json @@ -46,12 +46,21 @@ "approach": "Standard approach", "confidence": 0.9 }, - "commits": [], - "filesChanged": [], + "commits": ["2ecfb018"], + "filesChanged": [ + "crates/broker/src/main.rs", + "crates/broker/src/cli/mod.rs", + "crates/broker/src/runtime.rs", + "crates/broker/src/cli_mcp_args.rs", + "crates/broker/src/pty_worker.rs", + "crates/broker/src/routing.rs", + 
"crates/broker/src/worker.rs", + "crates/broker/src/wrap.rs" + ], "projectId": "/Users/will/Projects/AgentWorkforce/relay", "tags": [], "_trace": { - "startRef": "8a6b9b41b6d2de072e41ecd62f382419e1efb764", - "endRef": "8a6b9b41b6d2de072e41ecd62f382419e1efb764" + "startRef": "040e6d9fb2b9d1fa544a54d176dfc8a437ca76b9", + "endRef": "2ecfb01884a7cd3eae7a67768c20b84feb994c29" } } From d52c1476dc8b28f8504cf9d60dce10b1719b8c59 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 21:17:02 -0400 Subject: [PATCH 3/8] chore: complete issue 875 trajectory file list --- .trajectories/completed/2026-05/traj_f9wxa8ujeg78.json | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json index 8de23c4b4..69ff2ea2e 100644 --- a/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json +++ b/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json @@ -48,6 +48,9 @@ }, "commits": ["2ecfb018"], "filesChanged": [ + ".trajectories/completed/2026-05/traj_f9wxa8ujeg78.json", + ".trajectories/completed/2026-05/traj_f9wxa8ujeg78.md", + ".trajectories/index.json", "crates/broker/src/main.rs", "crates/broker/src/cli/mod.rs", "crates/broker/src/runtime.rs", From 7182810ce4863543c825d747a8f159644981d485 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 21:38:22 -0400 Subject: [PATCH 4/8] refactor: split broker runtime modules --- crates/broker/src/runtime/connection.rs | 193 + crates/broker/src/runtime/delivery.rs | 455 ++ crates/broker/src/runtime/headless.rs | 387 ++ .../src/{runtime.rs => runtime/init.rs} | 4305 +---------------- crates/broker/src/runtime/io.rs | 70 + crates/broker/src/runtime/messages.rs | 572 +++ crates/broker/src/runtime/mod.rs | 89 + crates/broker/src/runtime/paths.rs | 184 + crates/broker/src/runtime/session.rs | 264 + crates/broker/src/runtime/spawn_spec.rs | 68 + crates/broker/src/runtime/system.rs | 94 + crates/broker/src/runtime/tests.rs 
| 1743 +++++++ crates/broker/src/runtime/util.rs | 211 + 13 files changed, 4331 insertions(+), 4304 deletions(-) create mode 100644 crates/broker/src/runtime/connection.rs create mode 100644 crates/broker/src/runtime/delivery.rs create mode 100644 crates/broker/src/runtime/headless.rs rename crates/broker/src/{runtime.rs => runtime/init.rs} (55%) create mode 100644 crates/broker/src/runtime/io.rs create mode 100644 crates/broker/src/runtime/messages.rs create mode 100644 crates/broker/src/runtime/mod.rs create mode 100644 crates/broker/src/runtime/paths.rs create mode 100644 crates/broker/src/runtime/session.rs create mode 100644 crates/broker/src/runtime/spawn_spec.rs create mode 100644 crates/broker/src/runtime/system.rs create mode 100644 crates/broker/src/runtime/tests.rs create mode 100644 crates/broker/src/runtime/util.rs diff --git a/crates/broker/src/runtime/connection.rs b/crates/broker/src/runtime/connection.rs new file mode 100644 index 000000000..1e04d461d --- /dev/null +++ b/crates/broker/src/runtime/connection.rs @@ -0,0 +1,193 @@ +use super::*; + +/// Connection metadata discovered from a running broker — typically by +/// reading `/connection.json` or from explicit CLI flags / env. +pub(crate) struct BrokerConnection { + base_url: String, + api_key: Option, +} + +/// Resolve the broker connection by checking, in order: +/// +/// 1. Explicit CLI args (`--broker-url`, `--api-key`). When `--broker-url` +/// is supplied without an API key, we still attempt to fall back to the +/// API key from env / `.agent-relay/connection.json` so users don't have +/// to repeat `--api-key` for every dump-pty invocation. +/// 2. Env vars `RELAY_BROKER_URL` / `RELAY_BROKER_API_KEY`. +/// 3. `connection.json` in the supplied state dir, otherwise +/// `.agent-relay/connection.json` directly under the current working +/// directory. 
The bare `cwd` is intentionally NOT probed — an unrelated +/// `connection.json` sitting in the user's repo root must not silently +/// redirect the snapshot request (and its broker API key) elsewhere. +pub(crate) fn discover_broker_connection( + explicit_url: Option<&str>, + explicit_api_key: Option<&str>, + state_dir: Option<&Path>, +) -> Result { + // Walk the same search roots used for the URL fallback, but only to + // pull out a stored `api_key`. Lets `--broker-url` reuse the broker's + // saved key when the env var and `--api-key` are both unset. + let api_key_from_connection_file = || -> Option { + let cwd = std::env::current_dir().ok()?; + let roots: Vec = match state_dir { + Some(dir) => vec![dir.to_path_buf()], + None => vec![cwd.join(".agent-relay")], + }; + for root in roots { + let path = root.join("connection.json"); + if !path.is_file() { + continue; + } + let body = std::fs::read_to_string(&path).ok()?; + let value: Value = serde_json::from_str(&body).ok()?; + if let Some(key) = value.get("api_key").and_then(Value::as_str) { + if !key.trim().is_empty() { + return Some(key.to_string()); + } + } + } + None + }; + + let resolve_api_key = |explicit: Option<&str>| -> Option { + explicit + .map(ToString::to_string) + .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) + .or_else(api_key_from_connection_file) + .filter(|value| !value.trim().is_empty()) + }; + + if let Some(url) = explicit_url { + return Ok(BrokerConnection { + base_url: url.trim_end_matches('/').to_string(), + api_key: resolve_api_key(explicit_api_key), + }); + } + + if let Ok(url) = std::env::var("RELAY_BROKER_URL") { + let trimmed = url.trim(); + if !trimmed.is_empty() { + return Ok(BrokerConnection { + base_url: trimmed.trim_end_matches('/').to_string(), + api_key: resolve_api_key(explicit_api_key), + }); + } + } + + let cwd = std::env::current_dir().context("failed to read current directory")?; + let search_roots: Vec = match state_dir { + Some(dir) => vec![dir.to_path_buf()], + 
None => vec![cwd.join(".agent-relay")], + }; + + for root in &search_roots { + let path = root.join("connection.json"); + if !path.is_file() { + continue; + } + let body = std::fs::read_to_string(&path) + .with_context(|| format!("failed reading {}", path.display()))?; + let value: Value = serde_json::from_str(&body) + .with_context(|| format!("failed parsing {}", path.display()))?; + let url = value + .get("url") + .and_then(Value::as_str) + .with_context(|| format!("connection file missing 'url': {}", path.display()))? + .to_string(); + let api_key = explicit_api_key + .map(ToString::to_string) + .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) + .or_else(|| { + value + .get("api_key") + .and_then(Value::as_str) + .map(ToString::to_string) + }) + .filter(|value| !value.trim().is_empty()); + return Ok(BrokerConnection { + base_url: url.trim_end_matches('/').to_string(), + api_key, + }); + } + + anyhow::bail!( + "could not locate broker connection. Pass --broker-url, set RELAY_BROKER_URL, \ + or run from a directory containing .agent-relay/connection.json" + ); +} + +/// `agent-relay-broker dump-pty ` — capture and print a worker's +/// current visible screen by hitting the broker's snapshot route. 
+pub(crate) async fn run_dump_pty(cmd: DumpPtyCommand) -> Result<()> { + use base64::Engine; + + let connection = discover_broker_connection( + cmd.broker_url.as_deref(), + cmd.api_key.as_deref(), + cmd.state_dir.as_deref(), + )?; + + let url = format!( + "{}/api/spawned/{}/snapshot?format={}", + connection.base_url, + urlencoding::encode(&cmd.name), + cmd.format.as_wire_str(), + ); + + let client = reqwest::Client::builder() + .timeout(Duration::from_secs(10)) + .build() + .context("failed to build http client")?; + + let mut request = client.get(&url); + if let Some(key) = connection.api_key.as_deref() { + request = request.header("X-API-Key", key); + } + let response = request + .send() + .await + .with_context(|| format!("failed reaching broker at {url}"))?; + let status = response.status(); + let body_bytes = response + .bytes() + .await + .context("failed reading broker response body")?; + + if !status.is_success() { + let body_str = String::from_utf8_lossy(&body_bytes); + anyhow::bail!("broker returned {status}: {body_str}"); + } + + let body: Value = + serde_json::from_slice(&body_bytes).context("broker response was not valid JSON")?; + let screen = body + .get("screen") + .and_then(Value::as_str) + .context("broker response missing 'screen' field")?; + + match cmd.format { + DumpPtyFormat::Plain => { + // The plain payload already includes the trailing newline per row. + // Print as-is so pipelines see a stable terminator. 
+ use std::io::Write; + let mut stdout = std::io::stdout().lock(); + stdout + .write_all(screen.as_bytes()) + .context("failed writing snapshot to stdout")?; + stdout.flush().ok(); + } + DumpPtyFormat::Ansi => { + let bytes = base64::engine::general_purpose::STANDARD + .decode(screen) + .context("broker returned non-base64 ansi screen")?; + use std::io::Write; + let mut stdout = std::io::stdout().lock(); + stdout + .write_all(&bytes) + .context("failed writing snapshot to stdout")?; + stdout.flush().ok(); + } + } + + Ok(()) +} diff --git a/crates/broker/src/runtime/delivery.rs b/crates/broker/src/runtime/delivery.rs new file mode 100644 index 000000000..48b101af3 --- /dev/null +++ b/crates/broker/src/runtime/delivery.rs @@ -0,0 +1,455 @@ +use super::*; + +#[derive(Debug, Clone)] +pub(crate) struct PendingDelivery { + pub(super) worker_name: String, + pub(super) delivery: RelayDelivery, + pub(super) attempts: u32, + pub(super) next_retry_at: Instant, +} + +/// Serializable snapshot of pending deliveries for crash recovery. 
+#[derive(Debug, Clone, Serialize, Deserialize)] +pub(crate) struct PersistedPendingDelivery { + pub(super) worker_name: String, + pub(super) delivery: RelayDelivery, + pub(super) attempts: u32, +} + +pub(crate) fn save_pending_deliveries( + path: &Path, + deliveries: &HashMap, +) -> Result<()> { + let persisted: Vec = deliveries + .values() + .map(|pd| PersistedPendingDelivery { + worker_name: pd.worker_name.clone(), + delivery: pd.delivery.clone(), + attempts: pd.attempts, + }) + .collect(); + let json = serde_json::to_string_pretty(&persisted)?; + let dir = path.parent().unwrap_or(path); + let mut tmp = tempfile::NamedTempFile::new_in(dir) + .with_context(|| format!("failed creating temp file in {}", dir.display()))?; + std::io::Write::write_all(&mut tmp, json.as_bytes())?; + tmp.persist(path) + .with_context(|| format!("failed persisting pending deliveries to {}", path.display()))?; + Ok(()) +} + +pub(crate) fn load_pending_deliveries(path: &Path) -> HashMap { + let data = match std::fs::read_to_string(path) { + Ok(d) => d, + Err(_) => return HashMap::new(), + }; + let persisted: Vec = match serde_json::from_str(&data) { + Ok(v) => v, + Err(_) => return HashMap::new(), + }; + persisted + .into_iter() + .map(|p| { + let id = p.delivery.delivery_id.clone(); + ( + id, + PendingDelivery { + worker_name: p.worker_name, + delivery: p.delivery, + attempts: p.attempts, + next_retry_at: Instant::now(), // retry immediately on restart + }, + ) + }) + .collect() +} + +// These payload structs were used by the stdio protocol handler (handle_sdk_frame). +#[derive(Debug, Serialize)] +pub(crate) struct AgentMetrics { + pub(super) name: String, + pub(super) pid: u32, + pub(super) memory_bytes: u64, + pub(super) uptime_secs: u64, +} + +#[derive(Debug, Deserialize)] +pub(crate) struct DeliveryAckPayload { + pub(super) delivery_id: String, + pub(super) event_id: String, +} + +/// Outcome of [`queue_inbound_for_delivery_mode`]. 
Distinguishes the +/// three cases broker call sites care about: the message is queued and +/// should wait for an explicit flush, the queue should be drained now, +/// or there's no worker (caller falls through to existing target handling). +#[derive(Debug, Clone, PartialEq, Eq)] +pub(crate) enum InboundQueueOutcome { + Queued, + DrainNow(Vec), + WorkerMissing, +} + +/// Bundle of routing context captured into the pending queue. Mirrors the +/// args `queue_and_try_delivery_raw` +/// expects so a drain reproduces the original delivery exactly — same +/// target (channel / DM / thread sentinel), thread, workspace, +/// priority, and injection mode. +pub(crate) struct InboundContext<'a> { + pub(super) from: &'a str, + pub(super) body: &'a str, + pub(super) target: &'a str, + pub(super) thread_id: Option<&'a str>, + pub(super) workspace_id: Option<&'a str>, + pub(super) workspace_alias: Option<&'a str>, + pub(super) priority: u8, + pub(super) mode: MessageInjectionMode, + pub(super) event_id: Option<&'a str>, +} + +/// Queue an inbound relay message through the per-worker [`InboundDeliveryMode`]. +/// +/// Every inbound message is appended to the per-worker pending queue. In +/// [`InboundDeliveryMode::AutoInject`] the caller immediately drains the queue +/// in the same broker turn; in [`InboundDeliveryMode::ManualFlush`] the message +/// stays parked until an explicit flush or mode transition. +/// +/// Pulled out so the broker has one obvious choke point for the two +/// inbound paths (`/api/send` and the relaycast inbound feed) that the +/// `drive` client needs to intercept. Internal broker-driven injections +/// (`worker_ready` initial task, continuity restore) bypass this queue by +/// not calling this helper. 
+pub(crate) fn queue_inbound_for_delivery_mode( + delivery_states: &mut HashMap, + workers: &WorkerRegistry, + worker_name: &str, + ctx: InboundContext<'_>, +) -> InboundQueueOutcome { + if !workers.has_worker(worker_name) { + return InboundQueueOutcome::WorkerMissing; + } + let state = delivery_states.entry(worker_name.to_string()).or_default(); + let should_drain = state.should_drain_immediately(); + let queued_at_ms = chrono::Utc::now().timestamp_millis().max(0) as u64; + let msg = PendingRelayMessage { + from: ctx.from.to_string(), + body: ctx.body.to_string(), + target: ctx.target.to_string(), + thread_id: ctx.thread_id.map(str::to_string), + workspace_id: ctx.workspace_id.map(str::to_string), + workspace_alias: ctx.workspace_alias.map(str::to_string), + priority: ctx.priority, + mode: ctx.mode, + queued_at_ms, + event_id: ctx.event_id.map(str::to_string), + }; + match state.accept_inbound(msg) { + InboundDeliveryDispatch::Queued { queue_len } => { + tracing::debug!( + target = "agent_relay::broker", + worker = %worker_name, + from = %ctx.from, + mode = state.mode.as_wire_str(), + queue_len, + "queued inbound relay message" + ); + } + InboundDeliveryDispatch::QueuedEvicted { + queue_len, + dropped_from, + } => { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + from = %ctx.from, + dropped_from = %dropped_from, + mode = state.mode.as_wire_str(), + queue_len, + max_pending = relay_broker::types::MAX_PENDING_PER_WORKER, + "pending queue full — evicting oldest message" + ); + } + } + if should_drain { + let to_drain = state.drain_pending(); + tracing::debug!( + target = "agent_relay::broker", + worker = %worker_name, + drained = to_drain.len(), + "draining inbound queue immediately (auto_inject delivery mode)" + ); + InboundQueueOutcome::DrainNow(to_drain) + } else { + InboundQueueOutcome::Queued + } +} + +pub(crate) async fn try_inject_pending_relay_message( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + 
worker_name: &str, + msg: &PendingRelayMessage, + retry_interval: Duration, +) -> Result<()> { + let event_id = msg + .event_id + .clone() + .unwrap_or_else(|| format!("flush_{}", Uuid::new_v4().simple())); + match timeout( + retry_interval, + queue_and_try_delivery_raw( + workers, + pending_deliveries, + worker_name, + &event_id, + &msg.from, + // Use the ORIGINAL routing target captured at queue time — + // `#general`, the DM recipient name, `"thread"`, etc. Falling + // back to `worker_name` here would silently reframe channel + // messages as direct-to-worker messages on drain. + &msg.target, + &msg.body, + msg.thread_id.clone(), + msg.workspace_id.clone(), + msg.workspace_alias.clone(), + msg.priority, + msg.mode.clone(), + retry_interval, + ), + ) + .await + { + Ok(result) => result, + Err(_) => Err(anyhow::anyhow!( + "pending relay delivery timed out after {}ms", + retry_interval.as_millis() + )), + } +} + +/// Inject a previously-queued pending relay message into the worker via +/// the existing `queue_and_try_delivery_raw` path. Used by the +/// `/api/spawned/{name}/flush` handler and by the auto-drain on a +/// `manual_flush → auto_inject` transition. Failures are logged but not +/// propagated — the broker treats `flush` as best-effort fire-and-forget +/// the same way `/api/send` does for individual targets. 
+pub(crate) async fn inject_pending_relay_message( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + msg: &PendingRelayMessage, + retry_interval: Duration, +) { + let event_id = msg.event_id.as_deref().unwrap_or(""); + if let Err(error) = try_inject_pending_relay_message( + workers, + pending_deliveries, + worker_name, + msg, + retry_interval, + ) + .await + { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + from = %msg.from, + event_id = %event_id, + error = %error, + "failed to inject pending relay message during flush" + ); + } +} + +pub(crate) async fn queue_and_try_delivery( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + mapped: &relay_broker::types::InboundRelayEvent, + retry_interval: Duration, +) -> Result<()> { + queue_and_try_delivery_raw( + workers, + pending_deliveries, + worker_name, + &mapped.event_id, + &mapped.from, + &mapped.target, + &mapped.text, + mapped.thread_id.clone(), + Some(mapped.workspace_id.clone()), + mapped.workspace_alias.clone(), + mapped.priority.as_u8(), + MessageInjectionMode::Wait, + retry_interval, + ) + .await +} + +#[allow(clippy::too_many_arguments)] +pub(crate) async fn queue_and_try_delivery_raw( + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + worker_name: &str, + event_id: &str, + from: &str, + target: &str, + body: &str, + thread_id: Option, + workspace_id: Option, + workspace_alias: Option, + priority: u8, + injection_mode: MessageInjectionMode, + retry_interval: Duration, +) -> Result<()> { + let delivery = RelayDelivery { + delivery_id: format!("del_{}", Uuid::new_v4().simple()), + event_id: event_id.to_string(), + workspace_id, + workspace_alias, + from: from.to_string(), + target: target.to_string(), + body: body.to_string(), + thread_id, + priority: Some(priority), + injection_mode, + }; + let delivery_id = delivery.delivery_id.clone(); + pending_deliveries.insert( + 
delivery_id.clone(), + PendingDelivery { + worker_name: worker_name.to_string(), + delivery, + attempts: 0, + next_retry_at: Instant::now(), + }, + ); + + let _ = + retry_pending_delivery(&delivery_id, workers, pending_deliveries, retry_interval).await?; + Ok(()) +} + +pub(crate) async fn retry_pending_delivery( + delivery_id: &str, + workers: &mut WorkerRegistry, + pending_deliveries: &mut HashMap, + retry_interval: Duration, +) -> Result> { + let pending = match pending_deliveries.get(delivery_id) { + Some(pending) => pending.clone(), + None => return Ok(None), + }; + + if pending.attempts >= MAX_DELIVERY_RETRIES { + pending_deliveries.remove(delivery_id); + return Ok(None); + } + + if !workers.has_worker(&pending.worker_name) { + pending_deliveries.remove(delivery_id); + return Ok(None); + } + + match workers + .deliver(&pending.worker_name, pending.delivery.clone()) + .await + { + Ok(()) => { + if let Some(current) = pending_deliveries.get_mut(delivery_id) { + current.attempts = current.attempts.saturating_add(1); + current.next_retry_at = Instant::now() + retry_interval; + return Ok(Some(( + current.worker_name.clone(), + current.attempts, + current.delivery.event_id.clone(), + ))); + } + Ok(None) + } + Err(error) => { + if let Some(current) = pending_deliveries.get_mut(delivery_id) { + current.next_retry_at = Instant::now() + retry_interval; + } + Err(error) + } + } +} + +pub(crate) fn drop_pending_for_worker( + pending_deliveries: &mut HashMap, + worker_name: &str, +) -> usize { + let before = pending_deliveries.len(); + pending_deliveries.retain(|_, pending| pending.worker_name != worker_name); + before.saturating_sub(pending_deliveries.len()) +} + +/// Drain every in-flight worker request targeting `worker_name` and +/// notify each awaiter with [`worker_request::RequestWorkerError::WorkerDisappeared`]. 
+/// Called from every worker-teardown path (explicit release, +/// `worker_exited` frame, `reap_exited` periodic sweep) so HTTP callers +/// don't have to wait out the request deadline when the worker has +/// clearly gone. Logs one structured warning per drained request. +pub(crate) fn fail_pending_requests_for_worker( + pending_requests: &mut HashMap, + worker_name: &str, + reason: &'static str, +) -> usize { + let failed = worker_request::fail_for_worker(pending_requests, worker_name); + for (req_id, kind) in &failed { + tracing::warn!( + target = "agent_relay::broker", + request_id = %req_id, + worker = %worker_name, + kind = %kind, + reason = reason, + "failed pending worker request because worker is gone" + ); + } + failed.len() +} + +pub(crate) fn should_clear_pending_delivery_for_event( + pending: Option<&PendingDelivery>, + event_id: Option<&str>, +) -> bool { + let Some(pending) = pending else { + return true; + }; + + let Some(event_id) = event_id + .map(str::trim) + .filter(|event_id| !event_id.is_empty()) + else { + return true; + }; + + pending.delivery.event_id == event_id +} + +pub(crate) fn clear_pending_delivery_if_event_matches( + pending_deliveries: &mut HashMap, + delivery_id: &str, + event_id: Option<&str>, + worker_name: &str, + worker_signal: &str, +) { + let pending = pending_deliveries.get(delivery_id); + if should_clear_pending_delivery_for_event(pending, event_id) { + pending_deliveries.remove(delivery_id); + return; + } + + if let Some(pending) = pending { + tracing::warn!( + target = "agent_relay::broker", + worker = %worker_name, + signal = %worker_signal, + delivery_id = %delivery_id, + expected_event_id = %pending.delivery.event_id, + received_event_id = %event_id.unwrap_or(""), + "ignoring stale delivery lifecycle event due to event_id mismatch" + ); + } +} diff --git a/crates/broker/src/runtime/headless.rs b/crates/broker/src/runtime/headless.rs new file mode 100644 index 000000000..6138b3ac4 --- /dev/null +++ 
b/crates/broker/src/runtime/headless.rs @@ -0,0 +1,387 @@ +use super::*; + +pub(crate) fn headless_provider_cli_name(provider: &ProtocolHeadlessProvider) -> &'static str { + match provider { + ProtocolHeadlessProvider::Claude => "claude", + ProtocolHeadlessProvider::Opencode => "opencode", + } +} + +pub(crate) fn headless_provider_command( + provider: &ProtocolHeadlessProvider, + task: &str, + extra_args: &[String], +) -> (String, Vec) { + match provider { + ProtocolHeadlessProvider::Claude => { + let mut args = vec![ + "-p".to_string(), + "--dangerously-skip-permissions".to_string(), + ]; + args.extend(extra_args.iter().cloned()); + args.push(task.to_string()); + ("claude".to_string(), args) + } + ProtocolHeadlessProvider::Opencode => { + let mut args = vec!["run".to_string()]; + args.extend(extra_args.iter().cloned()); + args.push(task.to_string()); + ("opencode".to_string(), args) + } + } +} + +pub(crate) fn headless_provider_from_cli(value: &str) -> Option { + match value.trim().to_ascii_lowercase().as_str() { + "claude" => Some(ProtocolHeadlessProvider::Claude), + "opencode" => Some(ProtocolHeadlessProvider::Opencode), + _ => None, + } +} + +pub(crate) async fn run_headless_worker(cmd: HeadlessCommand) -> Result<()> { + let provider: ProtocolHeadlessProvider = cmd.provider.into(); + let provider_name = headless_provider_cli_name(&provider); + let provider_args = cmd.args.clone(); + + let (out_tx, mut out_rx) = mpsc::channel::>(512); + let writer_task = tokio::spawn(async move { + // Keep one async stdout handle for this process. Tokio's `write_all` + // is not cancel-safe if the task is aborted mid-write, so shutdown + // below drops `out_tx` and awaits this task before returning. 
+ let mut stdout = tokio::io::stdout(); + while let Some(frame) = out_rx.recv().await { + if let Ok(mut line) = serde_json::to_string(&frame) { + line.push('\n'); + if stdout.write_all(line.as_bytes()).await.is_err() || stdout.flush().await.is_err() + { + break; + } + } + } + }); + + let mut lines = BufReader::new(tokio::io::stdin()).lines(); + let mut worker_name = cmd + .agent_name + .clone() + .unwrap_or_else(|| format!("headless-{provider_name}")); + let mut final_exit_code: Option = None; + let mut final_exit_signal: Option = None; + + while let Ok(Some(line)) = lines.next_line().await { + let frame: ProtocolEnvelope = match serde_json::from_str(&line) { + Ok(frame) => frame, + Err(error) => { + let _ = send_frame( + &out_tx, + "worker_error", + None, + json!({ + "code":"invalid_frame", + "message": error.to_string(), + "retryable": false, + }), + ) + .await; + continue; + } + }; + + match frame.msg_type.as_str() { + "init_worker" => { + worker_name = cmd + .agent_name + .clone() + .or_else(|| { + frame + .payload + .get("agent") + .and_then(|a| a.get("name")) + .and_then(Value::as_str) + .map(ToOwned::to_owned) + }) + .unwrap_or_else(|| format!("headless-{provider_name}")); + + let _ = send_frame( + &out_tx, + "worker_ready", + frame.request_id, + json!({ + "name": &worker_name, + "runtime": "headless", + }), + ) + .await; + } + "deliver_relay" => { + let request_id = frame.request_id.clone(); + let delivery: RelayDelivery = match serde_json::from_value(frame.payload) { + Ok(d) => d, + Err(error) => { + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + "code":"invalid_delivery", + "message": error.to_string(), + "retryable": false, + }), + ) + .await; + continue; + } + }; + + let timestamp = chrono::Utc::now().timestamp_millis(); + let delivery_id = delivery.delivery_id; + let event_id = delivery.event_id; + + let _ = send_frame( + &out_tx, + "delivery_queued", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + 
"agent": &worker_name, + "timestamp": timestamp, + }), + ) + .await; + + let _ = send_frame( + &out_tx, + "delivery_injected", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "agent": &worker_name, + "timestamp": timestamp, + }), + ) + .await; + + let _ = send_frame( + &out_tx, + "delivery_active", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "pattern": format!("headless:{}", provider_name), + }), + ) + .await; + + let task_text = delivery.body.clone(); + let (binary, args) = + headless_provider_command(&provider, &task_text, &provider_args); + + let mut child_cmd = tokio::process::Command::new(&binary); + child_cmd + .args(&args) + .stdin(Stdio::null()) + .stdout(Stdio::piped()) + .stderr(Stdio::piped()); + + // Auto-approve tool permissions for opencode in headless mode. + if matches!(provider, ProtocolHeadlessProvider::Opencode) { + child_cmd.env( + "OPENCODE_PERMISSION", + r#"{"*":"allow","external_directory":{"*":"allow"}}"#, + ); + } + + let mut child = match child_cmd.spawn() { + Ok(child) => child, + Err(error) => { + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": format!("failed to spawn {}: {}", binary, error), + }), + ) + .await; + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + "code":"spawn_failed", + "message": format!("failed to spawn {}: {}", binary, error), + "retryable": false, + }), + ) + .await; + final_exit_code = Some(1); + break; + } + }; + + let _ = send_frame( + &out_tx, + "delivery_ack", + request_id.clone(), + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + }), + ) + .await; + + let stdout = child.stdout.take(); + let stderr = child.stderr.take(); + + let stream_stdout = { + let out_tx = out_tx.clone(); + async move { + if let Some(stdout) = stdout { + let mut lines = BufReader::new(stdout).lines(); + while let Ok(Some(chunk)) = lines.next_line().await { 
+ let _ = send_frame( + &out_tx, + "worker_stream", + None, + json!({ + "stream": "stdout", + "chunk": chunk, + }), + ) + .await; + } + } + } + }; + + let stream_stderr = { + let out_tx = out_tx.clone(); + async move { + if let Some(stderr) = stderr { + let mut lines = BufReader::new(stderr).lines(); + while let Ok(Some(chunk)) = lines.next_line().await { + let _ = send_frame( + &out_tx, + "worker_stream", + None, + json!({ + "stream": "stderr", + "chunk": chunk, + }), + ) + .await; + } + } + } + }; + + let (status, _, _) = tokio::join!(child.wait(), stream_stdout, stream_stderr); + + match status { + Ok(exit_status) => { + final_exit_code = exit_status.code(); + final_exit_signal = None; + if exit_status.success() { + let _ = send_frame( + &out_tx, + "delivery_verified", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + }), + ) + .await; + } else { + let reason = match exit_status.code() { + Some(code) => format!("{} exited with code {}", binary, code), + None => format!("{} exited without an exit code", binary), + }; + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + }), + ) + .await; + } + } + Err(error) => { + let reason = format!("failed waiting for {}: {}", binary, error); + let _ = send_frame( + &out_tx, + "delivery_failed", + None, + json!({ + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + }), + ) + .await; + let _ = send_frame( + &out_tx, + "worker_error", + request_id, + json!({ + "code":"wait_failed", + "message": format!("failed waiting for {}: {}", binary, error), + "retryable": false, + }), + ) + .await; + final_exit_code = Some(1); + } + } + + break; + } + "ping" => { + let ts = frame + .payload + .get("ts_ms") + .and_then(Value::as_u64) + .unwrap_or_default(); + let _ = send_frame(&out_tx, "pong", frame.request_id, json!({"ts_ms": ts})).await; + } + "shutdown_worker" => { + break; + } + other => { + let 
_ = send_frame( + &out_tx, + "worker_error", + frame.request_id, + json!({ + "code":"unknown_type", + "message": format!("unsupported message type '{}'", other), + "retryable": false, + }), + ) + .await; + } + } + } + + let _ = send_frame( + &out_tx, + "worker_exited", + None, + json!({"code": final_exit_code, "signal": final_exit_signal}), + ) + .await; + drop(out_tx); + let _ = writer_task.await; + + Ok(()) +} diff --git a/crates/broker/src/runtime.rs b/crates/broker/src/runtime/init.rs similarity index 55% rename from crates/broker/src/runtime.rs rename to crates/broker/src/runtime/init.rs index 2b10759a0..0b6f7181c 100644 --- a/crates/broker/src/runtime.rs +++ b/crates/broker/src/runtime/init.rs @@ -1,779 +1,4 @@ -use std::{ - collections::{HashMap, HashSet, VecDeque}, - path::{Path, PathBuf}, - process::Stdio, - sync::{Arc, OnceLock}, - time::{Duration, Instant}, -}; - -use crate::helpers::{ - agent_name_eq, floor_char_boundary, is_self_name, normalize_cli_name, parse_cli_command, -}; -use crate::listen_api::{ - broadcast_if_relevant, listen_api_router, DeliveryRouteError, ListenApiConfig, - ListenApiRequest, SetInboundDeliveryModeOk, -}; -use crate::routing::display_target_for_dashboard; - -use anyhow::{Context, Result}; -use relaycast::WsEvent; -use serde::{Deserialize, Serialize}; -use serde_json::{json, Value}; -use tokio::{ - io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}, - sync::{broadcast, mpsc, Notify, RwLock}, - time::{timeout, MissedTickBehavior}, -}; -use uuid::Uuid; - -use relay_broker::{ - auth::AuthClient, - dedup::DedupCache, - message_bridge::map_ws_event, - multi_workspace::{MultiWorkspaceSession, WorkspaceInboundMessage, WorkspaceMembershipSummary}, - protocol::{ - AgentRuntime, AgentSpec, HeadlessProvider as ProtocolHeadlessProvider, - MessageInjectionMode, ProtocolEnvelope, RelayDelivery, PROTOCOL_VERSION, - }, - relaycast_ws::{ - format_worker_preregistration_error, registration_retry_after_secs, - retry_agent_registration, 
RegRetryOutcome, RelaycastHttpClient, WsControl, - }, - replay_buffer::{ReplayBuffer, DEFAULT_REPLAY_CAPACITY}, - snippets::ensure_relaycast_mcp_config, - telemetry::{ActionSource, TelemetryClient, TelemetryEvent}, - types::{ - BrokerCommandEvent, InboundDeliveryDispatch, InboundDeliveryMode, InboundDeliveryState, - InboundKind, PendingRelayMessage, - }, -}; - -use crate::cli::{DumpPtyCommand, DumpPtyFormat, HeadlessCommand, InitCommand}; -use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; -use crate::{broker, listen_api, routing, worker_request}; - -const DEFAULT_DELIVERY_RETRY_MS: u64 = 1_000; -const MAX_DELIVERY_RETRIES: u32 = 10; -const DEFAULT_RELAYCAST_BASE_URL: &str = "https://api.relaycast.dev"; -use crate::helpers::resolve_dm_participants_cached; -const THREAD_HISTORY_LIMIT: usize = 1_000; -const DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS: u64 = 3_000; -const DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS: u64 = 20_000; -const DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS: u64 = 200; -static TRACING_GUARD: OnceLock = OnceLock::new(); - -pub(crate) fn startup_debug_enabled() -> bool { - std::env::var("AGENT_RELAY_STARTUP_DEBUG") - .map(|value| { - let trimmed = value.trim(); - !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false") - }) - .unwrap_or(false) -} - -pub(crate) fn log_startup_phase(enabled: bool, started_at: Instant, message: impl AsRef) { - if enabled { - eprintln!( - "[agent-relay][startup +{}ms] {}", - started_at.elapsed().as_millis(), - message.as_ref() - ); - } -} - -pub(crate) fn headless_provider_cli_name(provider: &ProtocolHeadlessProvider) -> &'static str { - match provider { - ProtocolHeadlessProvider::Claude => "claude", - ProtocolHeadlessProvider::Opencode => "opencode", - } -} - -pub(crate) fn headless_provider_command( - provider: &ProtocolHeadlessProvider, - task: &str, - extra_args: &[String], -) -> (String, Vec) { - match provider { - ProtocolHeadlessProvider::Claude => { - let mut args = vec![ - 
"-p".to_string(), - "--dangerously-skip-permissions".to_string(), - ]; - args.extend(extra_args.iter().cloned()); - args.push(task.to_string()); - ("claude".to_string(), args) - } - ProtocolHeadlessProvider::Opencode => { - let mut args = vec!["run".to_string()]; - args.extend(extra_args.iter().cloned()); - args.push(task.to_string()); - ("opencode".to_string(), args) - } - } -} - -pub(crate) fn headless_provider_from_cli(value: &str) -> Option { - match value.trim().to_ascii_lowercase().as_str() { - "claude" => Some(ProtocolHeadlessProvider::Claude), - "opencode" => Some(ProtocolHeadlessProvider::Opencode), - _ => None, - } -} - -pub(crate) fn runtime_label(runtime: &AgentRuntime) -> &'static str { - match runtime { - AgentRuntime::Pty => "pty", - AgentRuntime::Headless => "headless", - } -} - -#[allow(clippy::too_many_arguments)] -pub(crate) fn build_http_api_spawn_spec( - name: String, - cli: String, - transport: Option, - model: Option, - args: Vec, - channels: Vec, - cwd: Option, - team: Option, - shadow_of: Option, - shadow_mode: Option, - restart_policy: Option, -) -> Result { - let runtime = match transport - .as_deref() - .map(str::trim) - .filter(|value| !value.is_empty()) - .map(|value| value.to_ascii_lowercase()) - { - None => AgentRuntime::Pty, - Some(value) if value == "pty" => AgentRuntime::Pty, - Some(value) if value == "headless" => AgentRuntime::Headless, - Some(other) => { - anyhow::bail!("unsupported transport '{other}' (expected 'pty' or 'headless')") - } - }; - let parsed_restart_policy = match restart_policy { - Some(v) => Some(serde_json::from_value(v).context("invalid restart_policy")?), - None => None, - }; - - let (provider, cli_command, model) = match runtime { - AgentRuntime::Pty => (None, Some(cli), model), - AgentRuntime::Headless => { - let provider = headless_provider_from_cli(&cli).with_context(|| { - format!( - "provider '{cli}' does not support headless transport (supported: claude, opencode)" - ) - })?; - (Some(provider), None, 
model) - } - }; - - Ok(AgentSpec { - name, - runtime, - provider, - cli: cli_command, - model, - cwd, - team, - shadow_of, - shadow_mode, - args, - channels, - restart_policy: parsed_restart_policy, - }) -} - -#[derive(Debug)] -pub(crate) struct RuntimePaths { - persist: bool, - state: PathBuf, - pending: PathBuf, - /// Held for process lifetime to prevent concurrent broker instances (persist mode only). - #[allow(dead_code)] - _lock: Option, -} - -/// Shared Relaycast connection state used by run_init and run_wrap. -#[derive(Clone)] -pub(crate) struct RelayWorkspace { - pub(crate) workspace_id: String, - pub(crate) workspace_alias: Option, - pub(crate) relay_workspace_key: String, - pub(crate) self_name: String, - pub(crate) self_agent_id: String, - pub(crate) self_names: HashSet, - pub(crate) self_agent_ids: HashSet, - pub(crate) http_client: RelaycastHttpClient, - pub(crate) ws_control_tx: mpsc::Sender, -} - -pub(crate) struct RelaySession { - pub(crate) http_base: String, - pub(crate) default_workspace_id: Option, - pub(crate) workspaces: Vec, - pub(crate) ws_inbound_rx: mpsc::Receiver, -} - -#[derive(Clone)] -pub(crate) struct RelayReadyState { - workspace_key: String, - memberships: Vec, - default_workspace_id: Option, -} - -pub(crate) async fn serve_startup_api_until_ready( - listener: tokio::net::TcpListener, - relay_ready: Arc, -) -> tokio::net::TcpListener { - loop { - tokio::select! 
{ - _ = relay_ready.notified() => { - return listener; - } - accepted = listener.accept() => { - match accepted { - Ok((stream, _addr)) => { - tokio::spawn(handle_startup_api_connection(stream)); - } - Err(error) => { - tracing::warn!(error = %error, "startup API accept failed"); - tokio::time::sleep(Duration::from_millis(50)).await; - } - } - } - } - } -} - -pub(crate) async fn handle_startup_api_connection(mut stream: tokio::net::TcpStream) { - let mut buffer = [0_u8; 1024]; - let read = match timeout(Duration::from_secs(5), stream.read(&mut buffer)).await { - Ok(Ok(read)) => read, - Ok(Err(error)) => { - tracing::debug!(error = %error, "failed reading startup API request"); - return; - } - Err(_) => return, - }; - - let request = String::from_utf8_lossy(&buffer[..read]); - let path = request - .lines() - .next() - .and_then(|line| line.split_whitespace().nth(1)) - .unwrap_or("/"); - let (status, content_type, body) = if path == "/health" { - ( - "200 OK", - "application/json", - listen_api::listen_api_health_payload(None, vec![]).to_string(), - ) - } else { - ( - "503 Service Unavailable", - "text/plain; charset=utf-8", - "Broker is starting, please retry".to_string(), - ) - }; - let response = format!( - "HTTP/1.1 {status}\r\ncontent-type: {content_type}\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}", - body.len() - ); - if let Err(error) = stream.write_all(response.as_bytes()).await { - tracing::debug!(error = %error, "failed writing startup API response"); - } -} - -/// Build the standard env-var array passed to every spawned child agent. 
-pub(crate) fn normalize_initial_task(task: Option) -> Option { - task.and_then(|value| { - if value.trim().is_empty() { - None - } else { - Some(value) - } - }) -} - -pub(crate) struct RelaySessionOptions<'a> { - pub(crate) paths: &'a RuntimePaths, - pub(crate) requested_name: &'a str, - pub(crate) channels: Vec, - pub(crate) strict_name: bool, - pub(crate) agent_type: Option<&'a str>, - /// Read .mcp.json for additional self-name identities - pub(crate) read_mcp_identity: bool, - /// Write relaycast server entry to .mcp.json - pub(crate) ensure_mcp_config: bool, - pub(crate) runtime_cwd: &'a Path, -} - -pub(crate) async fn connect_relay(opts: RelaySessionOptions<'_>) -> Result { - let startup_debug = startup_debug_enabled(); - let connect_started = Instant::now(); - let http_base = std::env::var("RELAYCAST_BASE_URL") - .ok() - .or_else(|| std::env::var("RELAY_BASE_URL").ok()) - .unwrap_or_else(|| DEFAULT_RELAYCAST_BASE_URL.to_string()); - let ws_base = std::env::var("RELAYCAST_WS_URL") - .unwrap_or_else(|_| derive_ws_base_url_from_http(&http_base)); - - log_startup_phase( - startup_debug, - connect_started, - format!( - "connect_relay begin requested_name='{}' channels={}", - opts.requested_name, - opts.channels.join(",") - ), - ); - let auth = AuthClient::new(http_base.clone()); - let sessions = auth - .startup_session_set_with_options( - Some(opts.requested_name), - opts.strict_name, - opts.agent_type, - ) - .await - .context("failed to initialize relaycast session")?; - log_startup_phase( - startup_debug, - connect_started, - format!( - "startup_session_set_with_options complete memberships={}", - sessions.memberships.len() - ), - ); - - let default_session = sessions - .default_session() - .or_else(|| sessions.memberships.first()) - .context("no relaycast memberships were initialized")?; - let relay_workspace_key = default_session.credentials.api_key.clone(); - let self_agent_id = default_session.credentials.agent_id.clone(); - let self_token = 
default_session.token.clone(); - let agent_name = default_session - .credentials - .agent_name - .clone() - .unwrap_or_else(|| opts.requested_name.to_string()); - - let identity_debug = format!( - "agent_name='{}' -requested='{}' -agent_id='{}' -token_prefix='{}' -default_workspace='{}' -workspace_count='{}' -timestamp='{}' -", - agent_name, - opts.requested_name, - self_agent_id, - &self_token[..self_token.len().min(16)], - default_session.credentials.workspace_id, - sessions.memberships.len(), - chrono::Utc::now().to_rfc3339() - ); - let debug_path = opts - .paths - .state - .parent() - .unwrap() - .join("identity-debug.txt"); - if std::env::var("AGENT_RELAY_NO_DEBUG_FILES").is_err() { - let _ = std::fs::write(&debug_path, &identity_debug); - eprintln!( - "[agent-relay] identity debug written to {}", - debug_path.display() - ); - } - if agent_name != opts.requested_name { - eprintln!( - "[agent-relay] WARNING: registered as '{}' (requested '{}')", - agent_name, opts.requested_name - ); - } - - if opts.ensure_mcp_config { - if let Err(error) = ensure_relaycast_mcp_config( - opts.runtime_cwd, - Some(relay_workspace_key.as_str()), - Some(http_base.as_str()), - None, - ) { - tracing::warn!("failed to ensure .mcp.json: {error}"); - } - } - - log_startup_phase( - startup_debug, - connect_started, - "MultiWorkspaceSession::new begin", - ); - let mut multi = MultiWorkspaceSession::new( - http_base.clone(), - ws_base, - auth, - sessions, - opts.channels, - opts.read_mcp_identity, - opts.runtime_cwd, - relay_broker::events::EventEmitter::new(false), - ); - log_startup_phase( - startup_debug, - connect_started, - format!( - "MultiWorkspaceSession::new complete handles={} default_workspace={:?}", - multi.handles.len(), - multi.default_workspace_id - ), - ); - - let default_workspace_id = multi.default_workspace_id.clone(); - let workspaces = multi - .handles - .drain(..) 
- .map(|handle| RelayWorkspace { - workspace_id: handle.workspace_id, - workspace_alias: handle.workspace_alias, - relay_workspace_key: handle.relay_workspace_key, - self_name: handle.self_name, - self_agent_id: handle.self_agent_id, - self_names: handle.self_names, - self_agent_ids: handle.self_agent_ids, - http_client: handle.http_client, - ws_control_tx: handle.ws_control_tx, - }) - .collect(); - - Ok(RelaySession { - http_base, - default_workspace_id, - workspaces, - ws_inbound_rx: multi.inbound_rx, - }) -} - -#[derive(Debug, Clone)] -pub(crate) struct PendingDelivery { - worker_name: String, - delivery: RelayDelivery, - attempts: u32, - next_retry_at: Instant, -} - -/// Serializable snapshot of pending deliveries for crash recovery. -#[derive(Debug, Clone, Serialize, Deserialize)] -pub(crate) struct PersistedPendingDelivery { - worker_name: String, - delivery: RelayDelivery, - attempts: u32, -} - -pub(crate) fn save_pending_deliveries( - path: &Path, - deliveries: &HashMap, -) -> Result<()> { - let persisted: Vec = deliveries - .values() - .map(|pd| PersistedPendingDelivery { - worker_name: pd.worker_name.clone(), - delivery: pd.delivery.clone(), - attempts: pd.attempts, - }) - .collect(); - let json = serde_json::to_string_pretty(&persisted)?; - let dir = path.parent().unwrap_or(path); - let mut tmp = tempfile::NamedTempFile::new_in(dir) - .with_context(|| format!("failed creating temp file in {}", dir.display()))?; - std::io::Write::write_all(&mut tmp, json.as_bytes())?; - tmp.persist(path) - .with_context(|| format!("failed persisting pending deliveries to {}", path.display()))?; - Ok(()) -} - -pub(crate) fn load_pending_deliveries(path: &Path) -> HashMap { - let data = match std::fs::read_to_string(path) { - Ok(d) => d, - Err(_) => return HashMap::new(), - }; - let persisted: Vec = match serde_json::from_str(&data) { - Ok(v) => v, - Err(_) => return HashMap::new(), - }; - persisted - .into_iter() - .map(|p| { - let id = p.delivery.delivery_id.clone(); - ( 
- id, - PendingDelivery { - worker_name: p.worker_name, - delivery: p.delivery, - attempts: p.attempts, - next_retry_at: Instant::now(), // retry immediately on restart - }, - ) - }) - .collect() -} - -// These payload structs were used by the stdio protocol handler (handle_sdk_frame). -#[derive(Debug, Serialize)] -pub(crate) struct AgentMetrics { - name: String, - pid: u32, - memory_bytes: u64, - uptime_secs: u64, -} - -#[derive(Debug, Deserialize)] -pub(crate) struct DeliveryAckPayload { - delivery_id: String, - event_id: String, -} - -#[derive(Debug, Clone, Serialize, PartialEq, Eq)] -#[serde(rename_all = "camelCase")] -pub(crate) struct ThreadInfo { - thread_id: String, - name: String, - unread_count: usize, - #[serde(skip_serializing_if = "Option::is_none")] - last_message: Option, - #[serde(skip_serializing_if = "Option::is_none")] - last_message_at: Option, -} - -#[derive(Debug, Clone)] -pub(crate) struct ThreadAccumulator { - info: ThreadInfo, - sort_key: i64, -} - -pub(crate) fn normalize_sender(sender: Option) -> String { - let raw = sender - .unwrap_or_else(|| "human:orchestrator".to_string()) - .trim() - .to_string(); - if raw.is_empty() { - return "human:orchestrator".to_string(); - } - if let Some(rest) = raw.strip_prefix("human:") { - let normalized_rest = rest.trim(); - if normalized_rest.is_empty() { - return "human:orchestrator".to_string(); - } - return format!("human:{normalized_rest}"); - } - raw -} - -pub(crate) fn sender_is_dashboard_label(sender: &str, self_name: &str) -> bool { - let trimmed = sender.trim(); - trimmed.eq_ignore_ascii_case("Dashboard") - || trimmed.eq_ignore_ascii_case("human:Dashboard") - || trimmed.eq_ignore_ascii_case("human:orchestrator") - || trimmed.eq_ignore_ascii_case(self_name) -} - -/// Connection metadata discovered from a running broker — typically by -/// reading `/connection.json` or from explicit CLI flags / env. 
-pub(crate) struct BrokerConnection { - base_url: String, - api_key: Option, -} - -/// Resolve the broker connection by checking, in order: -/// -/// 1. Explicit CLI args (`--broker-url`, `--api-key`). When `--broker-url` -/// is supplied without an API key, we still attempt to fall back to the -/// API key from env / `.agent-relay/connection.json` so users don't have -/// to repeat `--api-key` for every dump-pty invocation. -/// 2. Env vars `RELAY_BROKER_URL` / `RELAY_BROKER_API_KEY`. -/// 3. `connection.json` in the supplied state dir, otherwise -/// `.agent-relay/connection.json` directly under the current working -/// directory. The bare `cwd` is intentionally NOT probed — an unrelated -/// `connection.json` sitting in the user's repo root must not silently -/// redirect the snapshot request (and its broker API key) elsewhere. -pub(crate) fn discover_broker_connection( - explicit_url: Option<&str>, - explicit_api_key: Option<&str>, - state_dir: Option<&Path>, -) -> Result { - // Walk the same search roots used for the URL fallback, but only to - // pull out a stored `api_key`. Lets `--broker-url` reuse the broker's - // saved key when the env var and `--api-key` are both unset. 
- let api_key_from_connection_file = || -> Option { - let cwd = std::env::current_dir().ok()?; - let roots: Vec = match state_dir { - Some(dir) => vec![dir.to_path_buf()], - None => vec![cwd.join(".agent-relay")], - }; - for root in roots { - let path = root.join("connection.json"); - if !path.is_file() { - continue; - } - let body = std::fs::read_to_string(&path).ok()?; - let value: Value = serde_json::from_str(&body).ok()?; - if let Some(key) = value.get("api_key").and_then(Value::as_str) { - if !key.trim().is_empty() { - return Some(key.to_string()); - } - } - } - None - }; - - let resolve_api_key = |explicit: Option<&str>| -> Option { - explicit - .map(ToString::to_string) - .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) - .or_else(api_key_from_connection_file) - .filter(|value| !value.trim().is_empty()) - }; - - if let Some(url) = explicit_url { - return Ok(BrokerConnection { - base_url: url.trim_end_matches('/').to_string(), - api_key: resolve_api_key(explicit_api_key), - }); - } - - if let Ok(url) = std::env::var("RELAY_BROKER_URL") { - let trimmed = url.trim(); - if !trimmed.is_empty() { - return Ok(BrokerConnection { - base_url: trimmed.trim_end_matches('/').to_string(), - api_key: resolve_api_key(explicit_api_key), - }); - } - } - - let cwd = std::env::current_dir().context("failed to read current directory")?; - let search_roots: Vec = match state_dir { - Some(dir) => vec![dir.to_path_buf()], - None => vec![cwd.join(".agent-relay")], - }; - - for root in &search_roots { - let path = root.join("connection.json"); - if !path.is_file() { - continue; - } - let body = std::fs::read_to_string(&path) - .with_context(|| format!("failed reading {}", path.display()))?; - let value: Value = serde_json::from_str(&body) - .with_context(|| format!("failed parsing {}", path.display()))?; - let url = value - .get("url") - .and_then(Value::as_str) - .with_context(|| format!("connection file missing 'url': {}", path.display()))? 
- .to_string(); - let api_key = explicit_api_key - .map(ToString::to_string) - .or_else(|| std::env::var("RELAY_BROKER_API_KEY").ok()) - .or_else(|| { - value - .get("api_key") - .and_then(Value::as_str) - .map(ToString::to_string) - }) - .filter(|value| !value.trim().is_empty()); - return Ok(BrokerConnection { - base_url: url.trim_end_matches('/').to_string(), - api_key, - }); - } - - anyhow::bail!( - "could not locate broker connection. Pass --broker-url, set RELAY_BROKER_URL, \ - or run from a directory containing .agent-relay/connection.json" - ); -} - -/// `agent-relay-broker dump-pty ` — capture and print a worker's -/// current visible screen by hitting the broker's snapshot route. -pub(crate) async fn run_dump_pty(cmd: DumpPtyCommand) -> Result<()> { - use base64::Engine; - - let connection = discover_broker_connection( - cmd.broker_url.as_deref(), - cmd.api_key.as_deref(), - cmd.state_dir.as_deref(), - )?; - - let url = format!( - "{}/api/spawned/{}/snapshot?format={}", - connection.base_url, - urlencoding::encode(&cmd.name), - cmd.format.as_wire_str(), - ); - - let client = reqwest::Client::builder() - .timeout(Duration::from_secs(10)) - .build() - .context("failed to build http client")?; - - let mut request = client.get(&url); - if let Some(key) = connection.api_key.as_deref() { - request = request.header("X-API-Key", key); - } - let response = request - .send() - .await - .with_context(|| format!("failed reaching broker at {url}"))?; - let status = response.status(); - let body_bytes = response - .bytes() - .await - .context("failed reading broker response body")?; - - if !status.is_success() { - let body_str = String::from_utf8_lossy(&body_bytes); - anyhow::bail!("broker returned {status}: {body_str}"); - } - - let body: Value = - serde_json::from_slice(&body_bytes).context("broker response was not valid JSON")?; - let screen = body - .get("screen") - .and_then(Value::as_str) - .context("broker response missing 'screen' field")?; - - match cmd.format 
{ - DumpPtyFormat::Plain => { - // The plain payload already includes the trailing newline per row. - // Print as-is so pipelines see a stable terminator. - use std::io::Write; - let mut stdout = std::io::stdout().lock(); - stdout - .write_all(screen.as_bytes()) - .context("failed writing snapshot to stdout")?; - stdout.flush().ok(); - } - DumpPtyFormat::Ansi => { - let bytes = base64::engine::general_purpose::STANDARD - .decode(screen) - .context("broker returned non-base64 ansi screen")?; - use std::io::Write; - let mut stdout = std::io::stdout().lock(); - stdout - .write_all(&bytes) - .context("failed writing snapshot to stdout")?; - stdout.flush().ok(); - } - } - - Ok(()) -} +use super::*; pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let broker_start = Instant::now(); @@ -3982,3531 +3207,3 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re Ok(()) } - -/// Get terminal rows from TIOCGWINSZ. -#[cfg(unix)] -pub(crate) fn terminal_rows() -> Option { - use nix::libc; - use nix::pty::Winsize; - let mut ws = Winsize { - ws_row: 0, - ws_col: 0, - ws_xpixel: 0, - ws_ypixel: 0, - }; - unsafe { - if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_row > 0 { - Some(ws.ws_row) - } else { - None - } - } -} - -/// Get terminal cols from TIOCGWINSZ. 
-#[cfg(unix)] -pub(crate) fn terminal_cols() -> Option { - use nix::libc; - use nix::pty::Winsize; - let mut ws = Winsize { - ws_row: 0, - ws_col: 0, - ws_xpixel: 0, - ws_ypixel: 0, - }; - unsafe { - if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_col > 0 { - Some(ws.ws_col) - } else { - None - } - } -} - -#[cfg(not(unix))] -pub(crate) fn terminal_rows() -> Option { - None -} -#[cfg(not(unix))] -pub(crate) fn terminal_cols() -> Option { - None -} - -#[cfg(target_os = "linux")] -pub(crate) fn memory_bytes_for_pid(pid: u32) -> u64 { - let statm_path = format!("/proc/{pid}/statm"); - let statm = match std::fs::read_to_string(statm_path) { - Ok(contents) => contents, - Err(_) => return 0, - }; - - let rss_pages = match statm - .split_whitespace() - .nth(1) - .and_then(|value| value.parse::().ok()) - { - Some(value) => value, - None => return 0, - }; - - let page_size = unsafe { nix::libc::sysconf(nix::libc::_SC_PAGESIZE) }; - if page_size <= 0 { - return 0; - } - - rss_pages.saturating_mul(page_size as u64) -} - -#[cfg(not(target_os = "linux"))] -pub(crate) fn memory_bytes_for_pid(_pid: u32) -> u64 { - 0 -} - -pub(crate) fn build_agent_metrics(handle: &WorkerHandle) -> AgentMetrics { - let pid = handle.child.id().unwrap_or_default(); - AgentMetrics { - name: handle.spec.name.clone(), - pid, - memory_bytes: if pid == 0 { - 0 - } else { - memory_bytes_for_pid(pid) - }, - uptime_secs: handle.spawned_at.elapsed().as_secs(), - } -} - -/// Outcome of [`queue_inbound_for_delivery_mode`]. Distinguishes the -/// three cases broker call sites care about: the message is queued and -/// should wait for an explicit flush, the queue should be drained now, -/// or there's no worker (caller falls through to existing target handling). -#[derive(Debug, Clone, PartialEq, Eq)] -pub(crate) enum InboundQueueOutcome { - Queued, - DrainNow(Vec), - WorkerMissing, -} - -/// Bundle of routing context captured into the pending queue. 
Mirrors the -/// args `queue_and_try_delivery_raw` -/// expects so a drain reproduces the original delivery exactly — same -/// target (channel / DM / thread sentinel), thread, workspace, -/// priority, and injection mode. -pub(crate) struct InboundContext<'a> { - from: &'a str, - body: &'a str, - target: &'a str, - thread_id: Option<&'a str>, - workspace_id: Option<&'a str>, - workspace_alias: Option<&'a str>, - priority: u8, - mode: MessageInjectionMode, - event_id: Option<&'a str>, -} - -/// Queue an inbound relay message through the per-worker [`InboundDeliveryMode`]. -/// -/// Every inbound message is appended to the per-worker pending queue. In -/// [`InboundDeliveryMode::AutoInject`] the caller immediately drains the queue -/// in the same broker turn; in [`InboundDeliveryMode::ManualFlush`] the message -/// stays parked until an explicit flush or mode transition. -/// -/// Pulled out so the broker has one obvious choke point for the two -/// inbound paths (`/api/send` and the relaycast inbound feed) that the -/// `drive` client needs to intercept. Internal broker-driven injections -/// (`worker_ready` initial task, continuity restore) bypass this queue by -/// not calling this helper. 
-pub(crate) fn queue_inbound_for_delivery_mode( - delivery_states: &mut HashMap, - workers: &WorkerRegistry, - worker_name: &str, - ctx: InboundContext<'_>, -) -> InboundQueueOutcome { - if !workers.has_worker(worker_name) { - return InboundQueueOutcome::WorkerMissing; - } - let state = delivery_states.entry(worker_name.to_string()).or_default(); - let should_drain = state.should_drain_immediately(); - let queued_at_ms = chrono::Utc::now().timestamp_millis().max(0) as u64; - let msg = PendingRelayMessage { - from: ctx.from.to_string(), - body: ctx.body.to_string(), - target: ctx.target.to_string(), - thread_id: ctx.thread_id.map(str::to_string), - workspace_id: ctx.workspace_id.map(str::to_string), - workspace_alias: ctx.workspace_alias.map(str::to_string), - priority: ctx.priority, - mode: ctx.mode, - queued_at_ms, - event_id: ctx.event_id.map(str::to_string), - }; - match state.accept_inbound(msg) { - InboundDeliveryDispatch::Queued { queue_len } => { - tracing::debug!( - target = "agent_relay::broker", - worker = %worker_name, - from = %ctx.from, - mode = state.mode.as_wire_str(), - queue_len, - "queued inbound relay message" - ); - } - InboundDeliveryDispatch::QueuedEvicted { - queue_len, - dropped_from, - } => { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - from = %ctx.from, - dropped_from = %dropped_from, - mode = state.mode.as_wire_str(), - queue_len, - max_pending = relay_broker::types::MAX_PENDING_PER_WORKER, - "pending queue full — evicting oldest message" - ); - } - } - if should_drain { - let to_drain = state.drain_pending(); - tracing::debug!( - target = "agent_relay::broker", - worker = %worker_name, - drained = to_drain.len(), - "draining inbound queue immediately (auto_inject delivery mode)" - ); - InboundQueueOutcome::DrainNow(to_drain) - } else { - InboundQueueOutcome::Queued - } -} - -pub(crate) async fn try_inject_pending_relay_message( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - 
worker_name: &str, - msg: &PendingRelayMessage, - retry_interval: Duration, -) -> Result<()> { - let event_id = msg - .event_id - .clone() - .unwrap_or_else(|| format!("flush_{}", Uuid::new_v4().simple())); - match timeout( - retry_interval, - queue_and_try_delivery_raw( - workers, - pending_deliveries, - worker_name, - &event_id, - &msg.from, - // Use the ORIGINAL routing target captured at queue time — - // `#general`, the DM recipient name, `"thread"`, etc. Falling - // back to `worker_name` here would silently reframe channel - // messages as direct-to-worker messages on drain. - &msg.target, - &msg.body, - msg.thread_id.clone(), - msg.workspace_id.clone(), - msg.workspace_alias.clone(), - msg.priority, - msg.mode.clone(), - retry_interval, - ), - ) - .await - { - Ok(result) => result, - Err(_) => Err(anyhow::anyhow!( - "pending relay delivery timed out after {}ms", - retry_interval.as_millis() - )), - } -} - -/// Inject a previously-queued pending relay message into the worker via -/// the existing `queue_and_try_delivery_raw` path. Used by the -/// `/api/spawned/{name}/flush` handler and by the auto-drain on a -/// `manual_flush → auto_inject` transition. Failures are logged but not -/// propagated — the broker treats `flush` as best-effort fire-and-forget -/// the same way `/api/send` does for individual targets. 
-pub(crate) async fn inject_pending_relay_message( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - msg: &PendingRelayMessage, - retry_interval: Duration, -) { - let event_id = msg.event_id.as_deref().unwrap_or(""); - if let Err(error) = try_inject_pending_relay_message( - workers, - pending_deliveries, - worker_name, - msg, - retry_interval, - ) - .await - { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - from = %msg.from, - event_id = %event_id, - error = %error, - "failed to inject pending relay message during flush" - ); - } -} - -pub(crate) async fn queue_and_try_delivery( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - mapped: &relay_broker::types::InboundRelayEvent, - retry_interval: Duration, -) -> Result<()> { - queue_and_try_delivery_raw( - workers, - pending_deliveries, - worker_name, - &mapped.event_id, - &mapped.from, - &mapped.target, - &mapped.text, - mapped.thread_id.clone(), - Some(mapped.workspace_id.clone()), - mapped.workspace_alias.clone(), - mapped.priority.as_u8(), - MessageInjectionMode::Wait, - retry_interval, - ) - .await -} - -#[allow(clippy::too_many_arguments)] -pub(crate) async fn queue_and_try_delivery_raw( - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - worker_name: &str, - event_id: &str, - from: &str, - target: &str, - body: &str, - thread_id: Option, - workspace_id: Option, - workspace_alias: Option, - priority: u8, - injection_mode: MessageInjectionMode, - retry_interval: Duration, -) -> Result<()> { - let delivery = RelayDelivery { - delivery_id: format!("del_{}", Uuid::new_v4().simple()), - event_id: event_id.to_string(), - workspace_id, - workspace_alias, - from: from.to_string(), - target: target.to_string(), - body: body.to_string(), - thread_id, - priority: Some(priority), - injection_mode, - }; - let delivery_id = delivery.delivery_id.clone(); - pending_deliveries.insert( - 
delivery_id.clone(), - PendingDelivery { - worker_name: worker_name.to_string(), - delivery, - attempts: 0, - next_retry_at: Instant::now(), - }, - ); - - let _ = - retry_pending_delivery(&delivery_id, workers, pending_deliveries, retry_interval).await?; - Ok(()) -} - -pub(crate) async fn retry_pending_delivery( - delivery_id: &str, - workers: &mut WorkerRegistry, - pending_deliveries: &mut HashMap, - retry_interval: Duration, -) -> Result> { - let pending = match pending_deliveries.get(delivery_id) { - Some(pending) => pending.clone(), - None => return Ok(None), - }; - - if pending.attempts >= MAX_DELIVERY_RETRIES { - pending_deliveries.remove(delivery_id); - return Ok(None); - } - - if !workers.has_worker(&pending.worker_name) { - pending_deliveries.remove(delivery_id); - return Ok(None); - } - - match workers - .deliver(&pending.worker_name, pending.delivery.clone()) - .await - { - Ok(()) => { - if let Some(current) = pending_deliveries.get_mut(delivery_id) { - current.attempts = current.attempts.saturating_add(1); - current.next_retry_at = Instant::now() + retry_interval; - return Ok(Some(( - current.worker_name.clone(), - current.attempts, - current.delivery.event_id.clone(), - ))); - } - Ok(None) - } - Err(error) => { - if let Some(current) = pending_deliveries.get_mut(delivery_id) { - current.next_retry_at = Instant::now() + retry_interval; - } - Err(error) - } - } -} - -pub(crate) fn drop_pending_for_worker( - pending_deliveries: &mut HashMap, - worker_name: &str, -) -> usize { - let before = pending_deliveries.len(); - pending_deliveries.retain(|_, pending| pending.worker_name != worker_name); - before.saturating_sub(pending_deliveries.len()) -} - -/// Drain every in-flight worker request targeting `worker_name` and -/// notify each awaiter with [`worker_request::RequestWorkerError::WorkerDisappeared`]. 
-/// Called from every worker-teardown path (explicit release, -/// `worker_exited` frame, `reap_exited` periodic sweep) so HTTP callers -/// don't have to wait out the request deadline when the worker has -/// clearly gone. Logs one structured warning per drained request. -pub(crate) fn fail_pending_requests_for_worker( - pending_requests: &mut HashMap, - worker_name: &str, - reason: &'static str, -) -> usize { - let failed = worker_request::fail_for_worker(pending_requests, worker_name); - for (req_id, kind) in &failed { - tracing::warn!( - target = "agent_relay::broker", - request_id = %req_id, - worker = %worker_name, - kind = %kind, - reason = reason, - "failed pending worker request because worker is gone" - ); - } - failed.len() -} - -pub(crate) fn should_clear_pending_delivery_for_event( - pending: Option<&PendingDelivery>, - event_id: Option<&str>, -) -> bool { - let Some(pending) = pending else { - return true; - }; - - let Some(event_id) = event_id - .map(str::trim) - .filter(|event_id| !event_id.is_empty()) - else { - return true; - }; - - pending.delivery.event_id == event_id -} - -pub(crate) fn clear_pending_delivery_if_event_matches( - pending_deliveries: &mut HashMap, - delivery_id: &str, - event_id: Option<&str>, - worker_name: &str, - worker_signal: &str, -) { - let pending = pending_deliveries.get(delivery_id); - if should_clear_pending_delivery_for_event(pending, event_id) { - pending_deliveries.remove(delivery_id); - return; - } - - if let Some(pending) = pending { - tracing::warn!( - target = "agent_relay::broker", - worker = %worker_name, - signal = %worker_signal, - delivery_id = %delivery_id, - expected_event_id = %pending.delivery.event_id, - received_event_id = %event_id.unwrap_or(""), - "ignoring stale delivery lifecycle event due to event_id mismatch" - ); - } -} - -pub(crate) async fn run_headless_worker(cmd: HeadlessCommand) -> Result<()> { - let provider: ProtocolHeadlessProvider = cmd.provider.into(); - let provider_name = 
headless_provider_cli_name(&provider); - let provider_args = cmd.args.clone(); - - let (out_tx, mut out_rx) = mpsc::channel::>(512); - let writer_task = tokio::spawn(async move { - // Keep one async stdout handle for this process. Tokio's `write_all` - // is not cancel-safe if the task is aborted mid-write, so shutdown - // below drops `out_tx` and awaits this task before returning. - let mut stdout = tokio::io::stdout(); - while let Some(frame) = out_rx.recv().await { - if let Ok(mut line) = serde_json::to_string(&frame) { - line.push('\n'); - if stdout.write_all(line.as_bytes()).await.is_err() || stdout.flush().await.is_err() - { - break; - } - } - } - }); - - let mut lines = BufReader::new(tokio::io::stdin()).lines(); - let mut worker_name = cmd - .agent_name - .clone() - .unwrap_or_else(|| format!("headless-{provider_name}")); - let mut final_exit_code: Option = None; - let mut final_exit_signal: Option = None; - - while let Ok(Some(line)) = lines.next_line().await { - let frame: ProtocolEnvelope = match serde_json::from_str(&line) { - Ok(frame) => frame, - Err(error) => { - let _ = send_frame( - &out_tx, - "worker_error", - None, - json!({ - "code":"invalid_frame", - "message": error.to_string(), - "retryable": false, - }), - ) - .await; - continue; - } - }; - - match frame.msg_type.as_str() { - "init_worker" => { - worker_name = cmd - .agent_name - .clone() - .or_else(|| { - frame - .payload - .get("agent") - .and_then(|a| a.get("name")) - .and_then(Value::as_str) - .map(ToOwned::to_owned) - }) - .unwrap_or_else(|| format!("headless-{provider_name}")); - - let _ = send_frame( - &out_tx, - "worker_ready", - frame.request_id, - json!({ - "name": &worker_name, - "runtime": "headless", - }), - ) - .await; - } - "deliver_relay" => { - let request_id = frame.request_id.clone(); - let delivery: RelayDelivery = match serde_json::from_value(frame.payload) { - Ok(d) => d, - Err(error) => { - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - 
"code":"invalid_delivery", - "message": error.to_string(), - "retryable": false, - }), - ) - .await; - continue; - } - }; - - let timestamp = chrono::Utc::now().timestamp_millis(); - let delivery_id = delivery.delivery_id; - let event_id = delivery.event_id; - - let _ = send_frame( - &out_tx, - "delivery_queued", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "agent": &worker_name, - "timestamp": timestamp, - }), - ) - .await; - - let _ = send_frame( - &out_tx, - "delivery_injected", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "agent": &worker_name, - "timestamp": timestamp, - }), - ) - .await; - - let _ = send_frame( - &out_tx, - "delivery_active", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "pattern": format!("headless:{}", provider_name), - }), - ) - .await; - - let task_text = delivery.body.clone(); - let (binary, args) = - headless_provider_command(&provider, &task_text, &provider_args); - - let mut child_cmd = tokio::process::Command::new(&binary); - child_cmd - .args(&args) - .stdin(Stdio::null()) - .stdout(Stdio::piped()) - .stderr(Stdio::piped()); - - // Auto-approve tool permissions for opencode in headless mode. 
- if matches!(provider, ProtocolHeadlessProvider::Opencode) { - child_cmd.env( - "OPENCODE_PERMISSION", - r#"{"*":"allow","external_directory":{"*":"allow"}}"#, - ); - } - - let mut child = match child_cmd.spawn() { - Ok(child) => child, - Err(error) => { - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": format!("failed to spawn {}: {}", binary, error), - }), - ) - .await; - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - "code":"spawn_failed", - "message": format!("failed to spawn {}: {}", binary, error), - "retryable": false, - }), - ) - .await; - final_exit_code = Some(1); - break; - } - }; - - let _ = send_frame( - &out_tx, - "delivery_ack", - request_id.clone(), - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - }), - ) - .await; - - let stdout = child.stdout.take(); - let stderr = child.stderr.take(); - - let stream_stdout = { - let out_tx = out_tx.clone(); - async move { - if let Some(stdout) = stdout { - let mut lines = BufReader::new(stdout).lines(); - while let Ok(Some(chunk)) = lines.next_line().await { - let _ = send_frame( - &out_tx, - "worker_stream", - None, - json!({ - "stream": "stdout", - "chunk": chunk, - }), - ) - .await; - } - } - } - }; - - let stream_stderr = { - let out_tx = out_tx.clone(); - async move { - if let Some(stderr) = stderr { - let mut lines = BufReader::new(stderr).lines(); - while let Ok(Some(chunk)) = lines.next_line().await { - let _ = send_frame( - &out_tx, - "worker_stream", - None, - json!({ - "stream": "stderr", - "chunk": chunk, - }), - ) - .await; - } - } - } - }; - - let (status, _, _) = tokio::join!(child.wait(), stream_stdout, stream_stderr); - - match status { - Ok(exit_status) => { - final_exit_code = exit_status.code(); - final_exit_signal = None; - if exit_status.success() { - let _ = send_frame( - &out_tx, - "delivery_verified", - None, - json!({ - "delivery_id": delivery_id, - 
"event_id": event_id, - }), - ) - .await; - } else { - let reason = match exit_status.code() { - Some(code) => format!("{} exited with code {}", binary, code), - None => format!("{} exited without an exit code", binary), - }; - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - }), - ) - .await; - } - } - Err(error) => { - let reason = format!("failed waiting for {}: {}", binary, error); - let _ = send_frame( - &out_tx, - "delivery_failed", - None, - json!({ - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - }), - ) - .await; - let _ = send_frame( - &out_tx, - "worker_error", - request_id, - json!({ - "code":"wait_failed", - "message": format!("failed waiting for {}: {}", binary, error), - "retryable": false, - }), - ) - .await; - final_exit_code = Some(1); - } - } - - break; - } - "ping" => { - let ts = frame - .payload - .get("ts_ms") - .and_then(Value::as_u64) - .unwrap_or_default(); - let _ = send_frame(&out_tx, "pong", frame.request_id, json!({"ts_ms": ts})).await; - } - "shutdown_worker" => { - break; - } - other => { - let _ = send_frame( - &out_tx, - "worker_error", - frame.request_id, - json!({ - "code":"unknown_type", - "message": format!("unsupported message type '{}'", other), - "retryable": false, - }), - ) - .await; - } - } - } - - let _ = send_frame( - &out_tx, - "worker_exited", - None, - json!({"code": final_exit_code, "signal": final_exit_signal}), - ) - .await; - drop(out_tx); - let _ = writer_task.await; - - Ok(()) -} - -pub(crate) async fn send_error( - tx: &mpsc::Sender>, - request_id: Option, - code: &str, - message: String, - retryable: bool, - data: Option, -) -> Result<()> { - send_frame( - tx, - "error", - request_id, - json!({ - "code": code, - "message": message, - "retryable": retryable, - "data": data, - }), - ) - .await -} - -pub(crate) async fn send_event( - tx: &mpsc::Sender>, - payload: Value, -) -> Result<()> { 
- send_frame(tx, "event", None, payload).await -} - -pub(crate) async fn emit_http_api_event_with_timeout( - tx: &mpsc::Sender>, - payload: Value, - timeout_window: Duration, -) { - match timeout(timeout_window, send_event(tx, payload)).await { - Ok(Ok(())) => {} - Ok(Err(error)) => { - tracing::warn!( - target = "relay_broker::http_api", - error = %error, - "failed to enqueue HTTP API event" - ); - } - Err(_) => { - tracing::warn!( - target = "relay_broker::http_api", - timeout_ms = %timeout_window.as_millis(), - "timed out enqueuing HTTP API event" - ); - } - } -} - -pub(crate) async fn send_frame( - tx: &mpsc::Sender>, - msg_type: &str, - request_id: Option, - payload: Value, -) -> Result<()> { - tx.send(ProtocolEnvelope { - v: PROTOCOL_VERSION, - msg_type: msg_type.to_string(), - request_id, - payload, - }) - .await - .context("failed to enqueue outbound frame") -} - -pub(crate) fn init_tracing() { - let (writer, guard) = tracing_appender::non_blocking(std::io::stderr()); - let subscriber = tracing_subscriber::fmt::Subscriber::builder() - .with_env_filter( - tracing_subscriber::EnvFilter::try_from_default_env() - .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), - ) - .with_target(true) - .with_writer(writer) - .finish(); - if tracing::subscriber::set_global_default(subscriber).is_ok() { - let _ = TRACING_GUARD.set(guard); - } -} - -pub(crate) fn channels_from_csv(raw: &str) -> Vec { - raw.split(',') - .map(str::trim) - .filter(|s| !s.is_empty()) - .map(ToOwned::to_owned) - .collect() -} - -/// Default channels for freshly spawned agents. -/// Reads RELAY_DEFAULT_CHANNELS (comma-separated) or falls back to the -/// broker's default channels: vec!["general", "engineering"] — both created -/// at startup by ensure_default_channels(). 
-pub(crate) fn default_spawn_channels() -> Vec { - if let Ok(raw) = std::env::var("RELAY_DEFAULT_CHANNELS") { - let parsed = channels_from_csv(&raw); - if !parsed.is_empty() { - return parsed; - } - } - // channels: ["general", "engineering"] (must match ensure_default_channels) - vec!["general".to_string(), "engineering".to_string()] -} - -pub(crate) fn command_targets_self(cmd_event: &BrokerCommandEvent, self_agent_id: &str) -> bool { - match cmd_event.handler_agent_id.as_deref() { - Some(handler_id) => handler_id == self_agent_id, - None => { - tracing::warn!( - command = %cmd_event.command, - invoked_by = %cmd_event.invoked_by, - "command has no handler_agent_id; accepting by default (multi-broker setups should scope commands)" - ); - true - } - } -} - -pub(crate) fn env_flag_enabled(name: &str) -> bool { - std::env::var(name) - .ok() - .map(|value| value.trim().to_ascii_lowercase()) - .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes" | "on")) -} - -pub(crate) fn delivery_retry_interval() -> Duration { - let ms = std::env::var("AGENT_RELAY_DELIVERY_RETRY_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_DELIVERY_RETRY_MS); - Duration::from_millis(ms.max(50)) -} - -pub(crate) fn http_api_local_delivery_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS); - Duration::from_millis(ms.max(100)) -} - -pub(crate) fn http_api_relaycast_send_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - .unwrap_or(DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS); - Duration::from_millis(ms.max(500)) -} - -pub(crate) fn http_api_event_emit_timeout() -> Duration { - let ms = std::env::var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS") - .ok() - .and_then(|raw| raw.trim().parse::().ok()) - 
.unwrap_or(DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS); - Duration::from_millis(ms.max(25)) -} - -pub(crate) fn normalize_channel(raw: &str) -> String { - let trimmed = raw.trim(); - if trimmed.starts_with('#') { - trimmed.to_string() - } else { - format!("#{trimmed}") - } -} - -pub(crate) fn build_agent_state_transition_event( - name: &str, - state: &str, - reason: Option<&str>, -) -> Value { - let mut payload = json!({ - "type": "agent.state", - "state": state, - "agent": { "name": name }, - "timestamp": chrono::Utc::now().to_rfc3339(), - }); - if let Some(reason) = reason.map(str::trim).filter(|value| !value.is_empty()) { - payload["reason"] = json!(reason); - } - payload -} - -pub(crate) async fn publish_agent_state_transition( - ws_control_tx: &mpsc::Sender, - name: &str, - state: &str, - reason: Option<&str>, -) { - let event = build_agent_state_transition_event(name, state, reason); - if let Err(error) = ws_control_tx.send(WsControl::Publish(event)).await { - tracing::debug!( - agent = %name, - state = %state, - error = %error, - "failed to publish agent state transition" - ); - } -} - -pub(crate) fn normalize_identity_for_thread(raw: &str) -> String { - raw.trim().trim_start_matches('@').to_ascii_lowercase() -} - -pub(crate) fn json_scalar_to_string(value: &Value) -> Option { - match value { - Value::String(text) => { - let trimmed = text.trim(); - if trimmed.is_empty() { - None - } else { - Some(trimmed.to_string()) - } - } - Value::Number(number) => Some(number.to_string()), - _ => None, - } -} - -pub(crate) fn first_string(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(json_scalar_to_string)) -} - -pub(crate) fn first_bool(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(Value::as_bool)) -} - -pub(crate) fn first_u64(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| 
value.pointer(pointer).and_then(Value::as_u64)) -} - -pub(crate) fn first_i64(value: &Value, pointers: &[&str]) -> Option { - pointers - .iter() - .find_map(|pointer| value.pointer(pointer).and_then(Value::as_i64)) -} - -pub(crate) fn relaycast_ws_control_dedup_key( - workspace_id: &str, - ws_type: &str, - value: &Value, -) -> Option { - let identity = if ws_type == "agent.spawn_requested" { - relaycast_ws_spawn_token(value) - .or_else(|| { - first_string( - value, - &[ - "/event_id", - "/id", - "/payload/id", - "/payload/event_id", - "/agent/id", - "/agent/event_id", - "/message/id", - "/message/event_id", - "/message_id", - ], - ) - }) - .or_else(|| first_string(value, &["/agent/name", "/payload/agent/name", "/name"])) - } else { - first_string( - value, - &[ - "/event_id", - "/id", - "/payload/id", - "/payload/event_id", - "/agent/id", - "/agent/event_id", - "/message/id", - "/message/event_id", - "/message_id", - ], - ) - } - .or_else(|| serde_json::to_string(value).ok())?; - Some(format!("control:{workspace_id}:{ws_type}:{identity}")) -} - -pub(crate) fn relaycast_ws_spawn_token(value: &Value) -> Option { - first_string( - value, - &[ - "/agent/token", - "/agent/relay_key", - "/agent/api_key", - "/token", - ], - ) -} - -pub(crate) fn relaycast_spawn_control_dedup_key(workspace_id: &str, identity: &str) -> String { - format!("control:{workspace_id}:agent.spawn_requested:{identity}") -} - -pub(crate) fn relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key: Option<&str>, - local_spawn_echo_key: &str, -) -> bool { - control_dedup_key != Some(local_spawn_echo_key) -} - -pub(crate) fn note_local_spawn_control_dedup( - dedup: &mut DedupCache, - workspace_id: Option<&str>, - agent_name: &str, - relay_key: Option<&str>, -) { - let Some(workspace_id) = workspace_id else { - return; - }; - let agent_name = agent_name.trim(); - if !agent_name.is_empty() { - let key = relaycast_spawn_control_dedup_key(workspace_id, agent_name); - dedup.insert_if_new(&key, 
Instant::now()); - } - if let Some(relay_key) = relay_key.map(str::trim).filter(|value| !value.is_empty()) { - let key = relaycast_spawn_control_dedup_key(workspace_id, relay_key); - dedup.insert_if_new(&key, Instant::now()); - } -} - -pub(crate) fn is_unknown_worker_error_message(message: &str) -> bool { - message.contains("unknown worker '") -} - -pub(crate) fn is_relaycast_self_control_target( - name: &str, - workspace_self_name: &str, - workspace_self_names: &HashSet, -) -> bool { - let normalized = normalize_identity_for_thread(name); - normalized == normalize_identity_for_thread(workspace_self_name) - || workspace_self_names.contains(&normalized) -} - -pub(crate) fn message_sender(value: &Value) -> Option { - first_string( - value, - &[ - "/from", - "/sender", - "/author", - "/agent_name", - "/message/from", - "/message/sender", - "/message/author", - "/payload/from", - "/payload/sender", - "/payload/author", - "/payload/message/from", - "/payload/message/sender", - "/payload/message/author", - ], - ) -} - -pub(crate) fn message_target(value: &Value) -> Option { - first_string( - value, - &[ - "/target", - "/to", - "/recipient", - "/channel", - "/conversation_id", - "/conversationId", - "/message/target", - "/message/to", - "/message/recipient", - "/message/channel", - "/message/conversation_id", - "/message/conversationId", - "/payload/target", - "/payload/to", - "/payload/recipient", - "/payload/channel", - "/payload/conversation_id", - "/payload/conversationId", - "/payload/message/target", - "/payload/message/to", - "/payload/message/recipient", - "/payload/message/channel", - "/payload/message/conversation_id", - "/payload/message/conversationId", - ], - ) -} - -pub(crate) fn message_preview(value: &Value) -> Option { - let text = first_string( - value, - &[ - "/text", - "/body", - "/content", - "/message/text", - "/message/body", - "/message/content", - "/payload/text", - "/payload/body", - "/payload/content", - "/payload/message/text", - 
"/payload/message/body", - "/payload/message/content", - "/message", - "/payload/message", - ], - )?; - Some(truncate_thread_preview(&text, 200)) -} - -pub(crate) fn truncate_thread_preview(input: &str, max_len: usize) -> String { - let trimmed = input.trim(); - if trimmed.len() <= max_len { - return trimmed.to_string(); - } - let boundary = floor_char_boundary(trimmed, max_len); - let mut out = trimmed[..boundary].to_string(); - out.push_str("..."); - out -} - -pub(crate) fn parse_sort_key_from_raw_timestamp(raw: &str) -> Option { - let trimmed = raw.trim(); - if trimmed.is_empty() { - return None; - } - if let Ok(epoch) = trimmed.parse::() { - return Some(epoch); - } - chrono::DateTime::parse_from_rfc3339(trimmed) - .ok() - .map(|parsed| parsed.timestamp_millis()) -} - -pub(crate) fn message_timestamp_string(value: &Value) -> Option { - first_string( - value, - &[ - "/created_at", - "/createdAt", - "/timestamp", - "/ts", - "/message/created_at", - "/message/createdAt", - "/message/timestamp", - "/message/ts", - "/payload/created_at", - "/payload/createdAt", - "/payload/timestamp", - "/payload/ts", - "/payload/message/created_at", - "/payload/message/createdAt", - "/payload/message/timestamp", - "/payload/message/ts", - ], - ) -} - -pub(crate) fn message_sort_key(value: &Value, index: usize) -> i64 { - if let Some(raw) = message_timestamp_string(value) { - if let Some(parsed) = parse_sort_key_from_raw_timestamp(&raw) { - return parsed; - } - } - - first_i64( - value, - &[ - "/created_at", - "/createdAt", - "/timestamp", - "/ts", - "/message/created_at", - "/message/createdAt", - "/message/timestamp", - "/message/ts", - "/payload/created_at", - "/payload/createdAt", - "/payload/timestamp", - "/payload/ts", - ], - ) - .unwrap_or(index as i64) -} - -pub(crate) fn message_thread_id(value: &Value) -> Option { - if let Some(explicit) = first_string( - value, - &[ - "/thread_id", - "/threadId", - "/parent_id", - "/conversation_id", - "/conversationId", - 
"/message/thread_id", - "/message/threadId", - "/message/parent_id", - "/message/conversation_id", - "/message/conversationId", - "/payload/thread_id", - "/payload/threadId", - "/payload/parent_id", - "/payload/conversation_id", - "/payload/conversationId", - "/payload/message/thread_id", - "/payload/message/threadId", - "/payload/message/parent_id", - "/payload/message/conversation_id", - "/payload/message/conversationId", - ], - ) { - return Some(explicit); - } - - let target = message_target(value)?; - if target.starts_with('#') { - return Some(normalize_channel(&target)); - } - if target.starts_with("conv_") - || target.starts_with("dm_") - || target.chars().all(|ch| ch.is_ascii_digit()) - { - return Some(target); - } - - let sender = message_sender(value)?; - let sender = normalize_identity_for_thread(&sender); - let target = normalize_identity_for_thread(&target); - if sender.is_empty() || target.is_empty() { - return None; - } - let (first, second) = if sender <= target { - (sender, target) - } else { - (target, sender) - }; - Some(format!("direct:{first}:{second}")) -} - -pub(crate) fn is_self_identity(value: &str, self_names: &HashSet) -> bool { - let normalized = normalize_identity_for_thread(value); - !normalized.is_empty() - && self_names - .iter() - .any(|self_name| normalize_identity_for_thread(self_name) == normalized) -} - -pub(crate) fn derive_thread_name( - message: &Value, - thread_id: &str, - self_names: &HashSet, -) -> String { - if let Some(explicit) = first_string( - message, - &[ - "/thread_name", - "/threadName", - "/title", - "/subject", - "/conversation_name", - "/conversationName", - ], - ) { - return explicit; - } - - if thread_id.starts_with('#') { - return thread_id.to_string(); - } - - // Use participants array (from workspace-level DM data) to build a combined name - // like "WorkerA ↔ WorkerB" for DMs between non-broker agents. 
- if let Some(participants) = message.get("participants").and_then(|v| v.as_array()) { - let names: Vec<&str> = participants - .iter() - .filter_map(|p| p.as_str()) - .filter(|name| !is_self_identity(name, self_names)) - .collect(); - if names.len() >= 2 { - return format!("{} ↔ {}", names[0], names[1]); - } else if names.len() == 1 { - return names[0].to_string(); - } - } - - if let Some(sender) = message_sender(message) { - if !is_self_identity(&sender, self_names) { - return sender.trim().trim_start_matches('@').to_string(); - } - } - - if let Some(target) = message_target(message) { - let trimmed = target.trim().trim_start_matches('@'); - if trimmed.starts_with('#') { - return normalize_channel(trimmed); - } - if !trimmed.is_empty() - && !trimmed.eq_ignore_ascii_case(thread_id) - && !is_self_identity(trimmed, self_names) - && !trimmed.starts_with("conv_") - && !trimmed.starts_with("dm_") - && !trimmed.chars().all(|ch| ch.is_ascii_digit()) - { - return trimmed.to_string(); - } - } - - thread_id.to_string() -} - -pub(crate) fn thread_unread_increment(message: &Value, self_names: &HashSet) -> usize { - if let Some(read) = first_bool( - message, - &[ - "/read", - "/is_read", - "/isRead", - "/message/read", - "/message/is_read", - "/message/isRead", - "/payload/read", - "/payload/is_read", - "/payload/isRead", - "/payload/message/read", - "/payload/message/is_read", - "/payload/message/isRead", - ], - ) { - return usize::from(!read); - } - - if let Some(sender) = message_sender(message) { - return usize::from(!is_self_identity(&sender, self_names)); - } - 0 -} - -pub(crate) fn build_thread_infos( - messages: &[Value], - self_names: &HashSet, -) -> Vec { - let mut by_thread: HashMap = HashMap::new(); - - for (index, message) in messages.iter().enumerate() { - let Some(thread_id) = message_thread_id(message) else { - continue; - }; - - let name = derive_thread_name(message, &thread_id, self_names); - let sort_key = message_sort_key(message, index); - let preview = 
message_preview(message); - let timestamp = message_timestamp_string(message); - let explicit_unread = first_u64( - message, - &[ - "/unread_count", - "/unreadCount", - "/message/unread_count", - "/message/unreadCount", - "/payload/unread_count", - "/payload/unreadCount", - "/payload/message/unread_count", - "/payload/message/unreadCount", - ], - ) - .map(|value| value as usize); - let unread_delta = thread_unread_increment(message, self_names); - - let entry = by_thread - .entry(thread_id.clone()) - .or_insert_with(|| ThreadAccumulator { - info: ThreadInfo { - thread_id: thread_id.clone(), - name: name.clone(), - unread_count: 0, - last_message: None, - last_message_at: None, - }, - sort_key, - }); - - if entry.info.name == entry.info.thread_id && name != entry.info.thread_id { - entry.info.name = name.clone(); - } - - if let Some(explicit_unread) = explicit_unread { - entry.info.unread_count = entry.info.unread_count.max(explicit_unread); - } else { - entry.info.unread_count = entry.info.unread_count.saturating_add(unread_delta); - } - - if sort_key >= entry.sort_key { - entry.sort_key = sort_key; - entry.info.name = name; - entry.info.last_message = preview; - entry.info.last_message_at = timestamp; - } - } - - let mut threads: Vec = by_thread.into_values().collect(); - threads.sort_by(|left, right| { - right - .sort_key - .cmp(&left.sort_key) - .then_with(|| left.info.thread_id.cmp(&right.info.thread_id)) - }); - - threads.into_iter().map(|entry| entry.info).collect() -} - -pub(crate) fn record_thread_history_event(history: &mut VecDeque, event: Value) { - if history.len() >= THREAD_HISTORY_LIMIT { - let _ = history.pop_front(); - } - history.push_back(event); -} - -/// Get current terminal size. Returns (rows, cols). -/// -/// Uses `crossterm::terminal::size()`, which is cross-platform: -/// TIOCGWINSZ on unix, GetConsoleScreenBufferInfo on Windows. 
-pub(crate) fn get_terminal_size() -> Option<(u16, u16)> { - crossterm::terminal::size() - .ok() - .map(|(cols, rows)| (rows, cols)) -} - -/// Detect Claude Code auto-suggestion ghost text. -/// -/// Auto-suggestions are rendered with reverse-video cursor + dim ghost text, -/// and often include the "↵ send" hint. -/// Extract Relaycast message IDs from MCP tool response output. -/// -/// When the agent sends a message via MCP (send_dm, send_message, etc.), -/// the response JSON contains `"id": ""`. We extract these IDs -/// and pre-seed the dedup cache so the WS echo of the same message is dropped. -/// This is more robust than name-based filtering since it works regardless -/// of what identity the MCP server registers with. -pub(crate) fn extract_mcp_message_ids(buffer: &str) -> Vec { - let mut ids = Vec::new(); - // Match patterns like "id": "147310274064424960" (Relaycast snowflake IDs are 18-digit numbers) - let mut search_start = 0; - while let Some(key_pos) = buffer[search_start..].find("\"id\"") { - let abs_pos = search_start + key_pos + 4; // skip past "id" - if abs_pos >= buffer.len() { - break; - } - let rest = &buffer[abs_pos..]; - // Skip whitespace and colon - let rest = rest.trim_start(); - let rest = if let Some(r) = rest.strip_prefix(':') { - r.trim_start() - } else { - search_start = abs_pos; - continue; - }; - // Extract quoted value - if let Some(r) = rest.strip_prefix('"') { - if let Some(end) = r.find('"') { - let value = &r[..end]; - // Only match numeric snowflake IDs (15-20 digits) - if value.len() >= 15 - && value.len() <= 20 - && value.chars().all(|c| c.is_ascii_digit()) - { - ids.push(value.to_string()); - } - } - } - search_start = abs_pos; - } - ids -} - -/// Returns the continuity directory path derived from the state file path. -/// State path is always `{cwd}/.agent-relay/state.json`, so parent is `{cwd}/.agent-relay/`. 
-pub(crate) fn continuity_dir(state_path: &Path) -> PathBuf { - state_path - .parent() - .expect("state_path always has a parent (.agent-relay/)") - .join("continuity") -} - -/// Create ephemeral runtime paths in the system temp directory. -/// -/// Unlike `ensure_runtime_paths`, this function: -/// - Writes nothing to the project directory -/// - Uses a deterministic temp directory derived from cwd+broker name so -/// duplicate brokers still collide on the same lock/PID files -/// -/// The temp directory is NOT removed on exit — the OS cleans it up on reboot. -/// State and pending-delivery files are still written there so they don't -/// interfere with the project tree; they're just ephemeral. -/// Ephemeral mode: no lock file, no PID file, no temp directory. -/// The broker lifecycle is tied to the parent process via stdin — when the -/// parent (SDK client) exits, stdin gets EOF and the broker shuts down. -/// Single-instance enforcement is unnecessary here because each SDK client -/// manages its own child process. -pub(crate) fn ensure_ephemeral_paths(_cwd: &Path, _broker_name: &str) -> Result { - // Use a random temp subdir so concurrent ephemeral brokers don't collide - // on state files. 
- let root = std::env::temp_dir().join(format!("agent-relay-ephemeral-{}", std::process::id())); - std::fs::create_dir_all(&root) - .with_context(|| format!("failed to create ephemeral temp dir {}", root.display()))?; - - Ok(RuntimePaths { - persist: false, - state: root.join("state.json"), - pending: root.join("pending.json"), - _lock: None, - }) -} - -pub(crate) fn ensure_runtime_paths( - cwd: &Path, - broker_name: &str, - state_dir: Option<&Path>, -) -> Result { - let root = state_dir - .map(PathBuf::from) - .unwrap_or_else(|| cwd.join(".agent-relay")); - std::fs::create_dir_all(&root) - .with_context(|| format!("failed to create runtime dir {}", root.display()))?; - - // Sanitise name for use in filenames — keep only alphanumeric and hyphens - let safe_name: String = broker_name - .chars() - .map(|c| { - if c.is_alphanumeric() || c == '-' { - c - } else { - '-' - } - }) - .collect(); - - // Lock and PID files are per-broker-name so concurrent workflows can coexist. - let lock_path = root.join(format!("broker-{safe_name}.lock")); - let lock_file = std::fs::File::create(&lock_path) - .with_context(|| format!("failed to create lock file {}", lock_path.display()))?; - - #[cfg(unix)] - { - use std::os::unix::io::AsRawFd; - let fd = lock_file.as_raw_fd(); - let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - // Lock acquisition failed — check if the holder is still alive - // by reading the PID from connection.json. - let connection_path = root.join("connection.json"); - let old_pid = std::fs::read_to_string(&connection_path) - .ok() - .and_then(|c| serde_json::from_str::(&c).ok()) - .and_then(|v| v.get("pid").and_then(|p| p.as_u64())) - .map(|p| p as u32); - if let Some(old_pid) = old_pid { - if !broker::is_pid_alive(old_pid) { - tracing::warn!( - old_pid = old_pid, - "stale broker lock detected (PID {} is dead), recovering", - old_pid - ); - // The old process is dead — remove stale PID file and retry lock. 
- // We drop and re-create the lock file to clear the stale flock. - drop(lock_file); - let lock_file = std::fs::File::create(&lock_path).with_context(|| { - format!( - "failed to re-create lock file after stale recovery {}", - lock_path.display() - ) - })?; - let fd = lock_file.as_raw_fd(); - let rc = - unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - anyhow::bail!( - "another broker instance is already running in this directory ({})", - root.display() - ); - } - // Successfully recovered — PID is written via connection.json at API start - return Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }); - } else { - anyhow::bail!( - "another broker instance is already running in this directory (pid: {}, {})", - old_pid, - root.display() - ); - } - } - // PID file missing or unreadable while lock is held — treat as stale. - // This happens when the user deletes .agent-relay/ while an old broker - // is still alive, or during the shutdown race (PID deleted before flock - // released). 
- tracing::warn!( - "broker lock held but no valid PID file found, treating as stale and recovering" - ); - drop(lock_file); - let lock_file = std::fs::File::create(&lock_path).with_context(|| { - format!( - "failed to re-create lock file after stale recovery {}", - lock_path.display() - ) - })?; - let fd = lock_file.as_raw_fd(); - let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; - if rc != 0 { - anyhow::bail!( - "another broker instance is already running in this directory ({})", - root.display() - ); - } - return Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }); - } - } - - // PID is written via connection.json at API start - - Ok(RuntimePaths { - persist: true, - state: root.join(format!("state-{safe_name}.json")), - pending: root.join(format!("pending-{safe_name}.json")), - _lock: Some(lock_file), - }) -} - -pub(crate) fn derive_ws_base_url_from_http(http_base: &str) -> String { - let trimmed = http_base.trim(); - if let Some(rest) = trimmed.strip_prefix("https://") { - format!("wss://{rest}") - } else if let Some(rest) = trimmed.strip_prefix("http://") { - format!("ws://{rest}") - } else { - trimmed.to_string() - } -} - -#[cfg(test)] -mod tests { - use std::{ - collections::{BTreeSet, HashMap, HashSet}, - path::PathBuf, - process::Stdio, - time::{Duration, Instant}, - }; - - use crate::helpers::{ - detect_bypass_permissions_prompt, detect_claude_trust_prompt, floor_char_boundary, - format_injection, is_auto_suggestion, is_bypass_selection_menu, is_in_editor_mode, - strip_ansi, - }; - use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; - use relay_broker::protocol::{AgentSpec, MessageInjectionMode, RelayDelivery}; - use serde_json::{json, Value}; - use tokio::sync::mpsc; - - use super::{ - build_agent_state_transition_event, build_http_api_spawn_spec, build_thread_infos, - 
channels_from_csv, continuity_dir, delivery_retry_interval, derive_ws_base_url_from_http, - display_target_for_dashboard, drop_pending_for_worker, extract_mcp_message_ids, - http_api_event_emit_timeout, http_api_local_delivery_timeout, - http_api_relaycast_send_timeout, is_relaycast_self_control_target, - is_unknown_worker_error_message, normalize_channel, normalize_initial_task, - normalize_sender, queue_inbound_for_delivery_mode, relaycast_spawn_control_dedup_key, - relaycast_ws_control_dedup_key, relaycast_ws_should_apply_local_spawn_echo_dedup, - relaycast_ws_spawn_token, sender_is_dashboard_label, - should_clear_pending_delivery_for_event, AgentRuntime, InboundContext, InboundQueueOutcome, - PendingDelivery, ProtocolHeadlessProvider, - }; - use relay_broker::dedup::DedupCache; - use relay_broker::relaycast_ws::{ - format_worker_preregistration_error, RelaycastRegistrationError, - }; - use relay_broker::types::{InboundDeliveryMode, InboundDeliveryState}; - - async fn make_worker_registry_with_worker(name: &str) -> WorkerRegistry { - let (tx, _rx) = mpsc::channel::(16); - let mut registry = WorkerRegistry::new( - tx, - Vec::new(), - PathBuf::from("/tmp/agent-relay-broker-tests"), - Instant::now(), - ); - let mut child = tokio::process::Command::new("cat") - .stdin(Stdio::piped()) - .stdout(Stdio::null()) - .stderr(Stdio::null()) - .spawn() - .expect("test worker process should spawn"); - let stdin = child.stdin.take().expect("test worker stdin should exist"); - registry.workers.insert( - name.to_string(), - WorkerHandle { - spec: AgentSpec { - name: name.to_string(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some("cat".to_string()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: Vec::new(), - channels: Vec::new(), - restart_policy: None, - }, - parent: None, - workspace_id: Some("ws_demo".to_string()), - child, - stdin, - spawned_at: Instant::now(), - }, - ); - registry - } - - async fn 
cleanup_worker_registry(mut registry: WorkerRegistry) { - for handle in registry.workers.values_mut() { - let _ = handle.child.start_kill(); - let _ = handle.child.wait().await; - } - } - - fn inbound_ctx<'a>(event_id: &'a str) -> InboundContext<'a> { - InboundContext { - from: "Alice", - body: "hello from relay", - target: "#general", - thread_id: Some("thr_123"), - workspace_id: Some("ws_demo"), - workspace_alias: Some("Demo"), - priority: 1, - mode: MessageInjectionMode::Steer, - event_id: Some(event_id), - } - } - - #[tokio::test] - async fn inbound_queue_auto_inject_drains_immediately_with_full_context() { - let worker_name = "worker-a"; - let workers = make_worker_registry_with_worker(worker_name).await; - let mut delivery_states = HashMap::new(); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - worker_name, - inbound_ctx("evt_auto"), - ); - - match outcome { - InboundQueueOutcome::DrainNow(messages) => { - assert_eq!(messages.len(), 1); - let msg = &messages[0]; - assert_eq!(msg.from, "Alice"); - assert_eq!(msg.body, "hello from relay"); - assert_eq!(msg.target, "#general"); - assert_eq!(msg.thread_id.as_deref(), Some("thr_123")); - assert_eq!(msg.workspace_id.as_deref(), Some("ws_demo")); - assert_eq!(msg.workspace_alias.as_deref(), Some("Demo")); - assert_eq!(msg.priority, 1); - assert_eq!(msg.mode, MessageInjectionMode::Steer); - assert_eq!(msg.event_id.as_deref(), Some("evt_auto")); - } - other => panic!("expected immediate drain, got {other:?}"), - } - assert_eq!( - delivery_states - .get(worker_name) - .expect("state should be created") - .pending_snapshot(), - Vec::new(), - "auto_inject drains the per-worker pending queue in the same broker turn" - ); - - cleanup_worker_registry(workers).await; - } - - #[tokio::test] - async fn inbound_queue_manual_flush_holds_until_explicit_drain() { - let worker_name = "worker-a"; - let workers = make_worker_registry_with_worker(worker_name).await; - let mut delivery_states = 
HashMap::from([( - worker_name.to_string(), - InboundDeliveryState::new(InboundDeliveryMode::ManualFlush), - )]); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - worker_name, - inbound_ctx("evt_manual"), - ); - - assert_eq!(outcome, InboundQueueOutcome::Queued); - let snapshot = delivery_states - .get(worker_name) - .expect("manual state should remain present") - .pending_snapshot(); - assert_eq!(snapshot.len(), 1); - assert_eq!(snapshot[0].event_id.as_deref(), Some("evt_manual")); - assert_eq!(snapshot[0].target, "#general"); - - cleanup_worker_registry(workers).await; - } - - #[tokio::test] - async fn inbound_queue_worker_missing_does_not_create_state() { - let (tx, _rx) = mpsc::channel::(16); - let workers = WorkerRegistry::new( - tx, - Vec::new(), - PathBuf::from("/tmp/agent-relay-broker-tests"), - Instant::now(), - ); - let mut delivery_states = HashMap::new(); - - let outcome = queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - "ghost", - inbound_ctx("evt_missing"), - ); - - assert_eq!(outcome, InboundQueueOutcome::WorkerMissing); - assert!(delivery_states.is_empty()); - } - - fn extract_kind_literals(source: &str) -> BTreeSet { - let marker = "\"kind\""; - let mut kinds = BTreeSet::new(); - let mut cursor = 0; - while let Some(offset) = source[cursor..].find(marker) { - let mut start = cursor + offset + marker.len(); - if start >= source.len() { - break; - } - if !source[start..].starts_with(':') { - cursor = start; - continue; - } - start += 1; - while start < source.len() && source.as_bytes()[start].is_ascii_whitespace() { - start += 1; - } - if start >= source.len() || source.as_bytes()[start] != b'"' { - cursor = start; - continue; - } - start += 1; - if let Some(end) = source[start..].find('"') { - let candidate = &source[start..start + end]; - if !candidate.is_empty() - && candidate - .chars() - .all(|c| c.is_ascii_lowercase() || c == '_' || c.is_ascii_digit()) - { - 
kinds.insert(candidate.to_string()); - } - } - cursor = start; - if cursor >= source.len() { - break; - } - } - kinds - } - - #[test] - fn parses_channels() { - assert_eq!(channels_from_csv("general,ops"), vec!["general", "ops"]); - } - - #[test] - fn channel_normalization() { - assert_eq!(normalize_channel("general"), "#general"); - assert_eq!(normalize_channel("#ops"), "#ops"); - } - - #[test] - fn normalize_initial_task_drops_empty_values() { - assert_eq!(normalize_initial_task(None), None); - assert_eq!(normalize_initial_task(Some(String::new())), None); - assert_eq!(normalize_initial_task(Some(" ".to_string())), None); - } - - #[test] - fn normalize_initial_task_keeps_non_empty_values() { - assert_eq!( - normalize_initial_task(Some("Ship the patch".to_string())), - Some("Ship the patch".to_string()) - ); - } - - #[test] - fn ws_base_derivation() { - assert_eq!( - derive_ws_base_url_from_http("https://api.relaycast.dev"), - "wss://api.relaycast.dev" - ); - assert_eq!( - derive_ws_base_url_from_http("http://localhost:8787"), - "ws://localhost:8787" - ); - } - - #[test] - fn relaycast_control_dedup_key_prefers_event_id() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { "name": "worker-a", "cli": "claude", "task": "Ship it" } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:evt_123".to_string()) - ); - } - - #[test] - fn relaycast_control_dedup_key_prefers_spawn_token_for_spawn_requests() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it", - "token": "at_live_worker" - } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:at_live_worker".to_string()) - ); - } - - #[test] - fn 
relaycast_control_dedup_key_falls_back_to_agent_name_for_spawn_requests() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - assert_eq!( - relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), - Some("control:ws_1:agent.spawn_requested:worker-a".to_string()) - ); - } - - #[test] - fn relaycast_control_dedup_key_falls_back_to_serialized_payload() { - let value = json!({ - "type": "agent.release_requested", - "agent": { "name": "worker-a" } - }); - - let key = relaycast_ws_control_dedup_key("ws_1", "agent.release_requested", &value) - .expect("fallback dedup key"); - assert!(key.starts_with("control:ws_1:agent.release_requested:{")); - assert!(key.contains("\"worker-a\"")); - } - - #[test] - fn relaycast_ws_spawn_token_extracts_agent_token() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "token": "at_live_worker" - } - }); - - assert_eq!( - relaycast_ws_spawn_token(&value), - Some("at_live_worker".to_string()) - ); - } - - #[test] - fn relaycast_ws_spawn_name_only_control_key_skips_second_name_dedup() { - let value = json!({ - "type": "agent.spawn_requested", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - let control_key = relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value) - .expect("control dedup key"); - let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); - - assert_eq!(control_key, local_key); - assert!(!relaycast_ws_should_apply_local_spawn_echo_dedup( - Some(control_key.as_str()), - &local_key - )); - } - - #[test] - fn relaycast_ws_spawn_event_id_echo_still_uses_local_name_dedup() { - let value = json!({ - "type": "agent.spawn_requested", - "event_id": "evt_123", - "agent": { - "name": "worker-a", - "cli": "claude", - "task": "Ship it" - } - }); - - let control_key = relaycast_ws_control_dedup_key("ws_1", 
"agent.spawn_requested", &value) - .expect("control dedup key"); - let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); - - assert_ne!(control_key, local_key); - assert!(relaycast_ws_should_apply_local_spawn_echo_dedup( - Some(control_key.as_str()), - &local_key - )); - - let now = Instant::now(); - let mut dedup = DedupCache::new(Duration::from_secs(60), 16); - assert!(dedup.insert_if_new(&local_key, now)); - assert!(dedup.insert_if_new(&control_key, now + Duration::from_secs(1))); - assert!(!dedup.insert_if_new(&local_key, now + Duration::from_secs(2))); - } - - #[test] - fn unknown_worker_error_message_matches_release_failures() { - assert!(is_unknown_worker_error_message("unknown worker 'worker-a'")); - assert!(is_unknown_worker_error_message( - "failed to release 'worker-a': unknown worker 'worker-a'" - )); - assert!(!is_unknown_worker_error_message("failed to bind api port")); - } - - #[test] - fn relaycast_self_control_target_matches_aliases_case_insensitively() { - let self_names = HashSet::from([ - "relay-broker".to_string(), - "relay-broker@workspace".to_string(), - ]); - - assert!(is_relaycast_self_control_target( - "Relay-Broker", - "relay-broker", - &self_names - )); - assert!(is_relaycast_self_control_target( - "@relay-broker@workspace", - "relay-broker", - &self_names - )); - assert!(!is_relaycast_self_control_target( - "worker-a", - "relay-broker", - &self_names - )); - } - - #[tokio::test] - async fn contract_health_fixture_requires_rich_listen_health_shape() { - let fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/health-fixtures.json" - )) - .expect("health fixture should be valid JSON"); - let expected_shape = fixture - .get("health_response") - .and_then(Value::as_object) - .expect("health fixture must include health_response object"); - - let actual = crate::listen_api::listen_api_health_payload(None, vec![]); - - for required_key in expected_shape.keys() { - // 
TODO(contract-wave1-health-shape): listen-mode /health should - // implement the shared BrokerHealthResponse contract fields. - assert!( - actual.get(required_key).is_some(), - "listen /health response is missing required contract field: {}", - required_key - ); - } - } - - #[tokio::test] - async fn contract_startup_429_fixture_requires_degraded_health_status() { - let fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/health-fixtures.json" - )) - .expect("health fixture should be valid JSON"); - let expected = fixture - .get("wave0_startup_429_degraded") - .and_then(|v| v.get("expected_health_status")) - .and_then(Value::as_str) - .expect("health fixture must include expected degraded health status"); - let startup_error_code = fixture - .get("wave0_startup_429_degraded") - .and_then(|v| v.get("error")) - .and_then(|v| v.get("code")) - .and_then(Value::as_str) - .expect("health fixture must include startup error code"); - std::env::set_var("AGENT_RELAY_STARTUP_ERROR_CODE", startup_error_code); - let actual = crate::listen_api::listen_api_health_payload(None, vec![]) - .get("status") - .and_then(Value::as_str) - .unwrap_or("unknown") - .to_string(); - std::env::remove_var("AGENT_RELAY_STARTUP_ERROR_CODE"); - - assert_eq!( - actual, expected, - "listen /health status \"{}\" does not match startup 429 degraded contract \"{}\"", - actual, expected - ); - } - - #[test] - fn contract_replay_fixture_requires_replay_route_exposure() { - let replay_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/replay-fixtures.json" - )) - .expect("replay fixture should be valid JSON"); - assert!( - replay_fixture.get("replay_cursor_request").is_some(), - "replay fixture must include replay_cursor_request" - ); - assert!( - replay_fixture.get("replay_response").is_some(), - "replay fixture must include replay_response" - ); - - let source = include_str!("listen_api.rs"); - assert!( - 
source.contains(".route(\"/api/events/replay\""), - "listen API router does not expose /api/events/replay" - ); - } - - #[test] - fn contract_timeout_fixture_requires_terminal_failed_guard_before_late_ack() { - let replay_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/replay-fixtures.json" - )) - .expect("replay fixture should be valid JSON"); - let timeout_fixture = replay_fixture - .get("wave0_timeout_terminal_semantics") - .and_then(Value::as_object) - .expect("replay fixture must include wave0_timeout_terminal_semantics object"); - - let expected_terminal_status = timeout_fixture - .get("expected_terminal_status") - .and_then(Value::as_str) - .expect("timeout fixture requires expected_terminal_status"); - let late_event_kind = timeout_fixture - .get("late_event_kind") - .and_then(Value::as_str) - .expect("timeout fixture requires late_event_kind"); - - let source = include_str!("runtime.rs"); - let ack_branch = source - .find("msg_type == \"delivery_ack\"") - .map(|idx| { - let end = (idx + 1200).min(source.len()); - &source[idx..end] - }) - .expect("main.rs must include delivery_ack handling"); - - assert!( - ack_branch.contains(expected_terminal_status) || ack_branch.contains("terminal"), - "delivery_ack branch lacks terminal guard for timeout status \"{}\" and late event \"{}\"", - expected_terminal_status, - late_event_kind - ); - } - - #[test] - fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() { - let event_fixture: Value = serde_json::from_str(include_str!( - "../../../packages/contracts/fixtures/event-fixtures.json" - )) - .expect("event fixture should be valid JSON"); - let required = event_fixture - .get("wave0_broadcast_whitelist") - .and_then(|v| v.get("required_kinds")) - .and_then(Value::as_array) - .expect("event fixture must include wave0_broadcast_whitelist.required_kinds") - .iter() - .filter_map(Value::as_str) - .map(str::to_owned) - .collect::>(); - - let emitted = 
extract_kind_literals(include_str!("runtime.rs")); - - assert!( - required.is_subset(&emitted), - "broker source is missing required broadcast kinds; expected {:?}, got {:?}", - required, - emitted - ); - } - - #[test] - fn build_thread_infos_groups_channel_messages() { - let messages = vec![ - json!({ - "from": "broker", - "target": "#general", - "text": "outbound", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "Lead", - "target": "#general", - "text": "inbound", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].thread_id, "#general"); - assert_eq!(threads[0].name, "#general"); - assert_eq!(threads[0].unread_count, 1); - assert_eq!(threads[0].last_message.as_deref(), Some("inbound")); - } - - #[test] - fn build_thread_infos_groups_direct_messages_case_insensitively() { - let messages = vec![ - json!({ - "from": "BROKER", - "to": "WorkerA", - "text": "ping", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "workera", - "to": "broker", - "text": "pong", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].thread_id, "direct:broker:workera"); - assert_eq!(threads[0].name, "workera"); - assert_eq!(threads[0].unread_count, 1); - assert_eq!(threads[0].last_message.as_deref(), Some("pong")); - } - - #[test] - fn build_thread_infos_uses_dm_conversation_id_and_sender_name() { - let messages = vec![json!({ - "from": "Planner", - "conversation_id": "conv_123", - "text": "dm payload", - "timestamp": "2026-02-23T10:01:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - 
assert_eq!(threads[0].thread_id, "conv_123"); - assert_eq!(threads[0].name, "Planner"); - assert_eq!(threads[0].unread_count, 1); - } - - #[test] - fn build_thread_infos_shows_dms_between_non_broker_agents() { - let messages = vec![ - json!({ - "from": "WorkerA", - "conversation_id": "dm_456", - "participants": ["WorkerA", "WorkerB"], - "text": "hello WorkerB", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "WorkerB", - "conversation_id": "dm_456", - "participants": ["WorkerA", "WorkerB"], - "text": "hi WorkerA", - "timestamp": "2026-02-23T10:01:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1, "should group into one conversation"); - assert_eq!(threads[0].thread_id, "dm_456"); - assert_eq!(threads[0].name, "WorkerA ↔ WorkerB"); - assert_eq!( - threads[0].unread_count, 2, - "both messages unread (neither from broker)" - ); - assert_eq!(threads[0].last_message.as_deref(), Some("hi WorkerA")); - } - - #[test] - fn build_thread_infos_dm_with_participants_filters_broker() { - let messages = vec![json!({ - "from": "WorkerA", - "conversation_id": "dm_789", - "participants": ["broker", "WorkerA"], - "text": "hello broker", - "timestamp": "2026-02-23T10:00:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!( - threads[0].name, "WorkerA", - "should filter out broker from participants" - ); - } - - #[test] - fn build_thread_infos_multiple_independent_dm_conversations() { - let messages = vec![ - json!({ - "from": "Alice", - "conversation_id": "dm_aaa", - "participants": ["Alice", "Bob"], - "text": "hi Bob", - "timestamp": "2026-02-23T10:00:00Z", - }), - json!({ - "from": "Charlie", - "conversation_id": "dm_bbb", - "participants": ["Charlie", "Diana"], - "text": "hi Diana", - "timestamp": "2026-02-23T10:01:00Z", - }), 
- json!({ - "from": "broker", - "conversation_id": "dm_ccc", - "participants": ["broker", "Eve"], - "text": "hi Eve", - "timestamp": "2026-02-23T10:02:00Z", - }), - ]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!( - threads.len(), - 3, - "should have three separate DM conversations" - ); - - let thread_aaa = threads.iter().find(|t| t.thread_id == "dm_aaa").unwrap(); - assert_eq!(thread_aaa.name, "Alice ↔ Bob"); - - let thread_bbb = threads.iter().find(|t| t.thread_id == "dm_bbb").unwrap(); - assert_eq!(thread_bbb.name, "Charlie ↔ Diana"); - - let thread_ccc = threads.iter().find(|t| t.thread_id == "dm_ccc").unwrap(); - assert_eq!(thread_ccc.name, "Eve", "broker filtered from participants"); - } - - #[test] - fn build_thread_infos_respects_explicit_unread_count() { - let messages = vec![json!({ - "from": "Planner", - "target": "broker", - "text": "status", - "unread_count": 7, - "timestamp": "2026-02-23T10:01:00Z", - })]; - let self_names = HashSet::from(["broker".to_string()]); - let threads = build_thread_infos(&messages, &self_names); - - assert_eq!(threads.len(), 1); - assert_eq!(threads[0].unread_count, 7); - } - - #[test] - fn build_agent_state_transition_event_has_expected_shape() { - let payload = build_agent_state_transition_event("worker-a", "spawned", Some("sdk_spawn")); - assert_eq!(payload["type"], "agent.state"); - assert_eq!(payload["state"], "spawned"); - assert_eq!(payload["agent"]["name"], "worker-a"); - assert_eq!(payload["reason"], "sdk_spawn"); - assert!(payload["timestamp"].as_str().is_some()); - - let no_reason = build_agent_state_transition_event("worker-a", "idle", None); - assert!(no_reason.get("reason").is_none()); - } - - #[test] - fn preregistration_error_message_dedupes_retry_after_for_rate_limit() { - let error = RelaycastRegistrationError::RateLimited { - agent_name: "Foobar".to_string(), - retry_after_secs: 60, - detail: 
"{\"ok\":false}".to_string(), - }; - let message = format_worker_preregistration_error("Foobar", &error); - assert_eq!(message.matches("retry after").count(), 1); - } - - #[test] - fn preregistration_error_message_does_not_invent_retry_after_for_transport_errors() { - let error = RelaycastRegistrationError::Transport { - agent_name: "Foobar".to_string(), - detail: "timeout".to_string(), - }; - let message = format_worker_preregistration_error("Foobar", &error); - assert!(!message.contains("retry after")); - } - - #[test] - fn injection_format_preserved() { - let rendered = format_injection("alice", "evt_1", "hello", "bob"); - assert!(rendered.contains("")); - assert!(rendered.contains("mcp__relaycast__message_dm_send")); - assert!(rendered.contains("Relay message from alice [evt_1]: hello")); - } - - #[test] - fn injection_format_includes_channel() { - let rendered = format_injection("alice", "evt_1", "hello", "#general"); - assert!(rendered.contains("mcp__relaycast__message_post")); - assert!(rendered.contains("channel: \"general\"")); - assert!(rendered.contains("Relay message from alice in #general [evt_1]: hello")); - } - - #[test] - fn normalize_sender_defaults_to_human_orchestrator() { - assert_eq!(normalize_sender(None), "human:orchestrator"); - assert_eq!(normalize_sender(Some(String::new())), "human:orchestrator"); - assert_eq!( - normalize_sender(Some(" ".to_string())), - "human:orchestrator" - ); - } - - #[test] - fn normalize_sender_normalizes_human_prefix() { - assert_eq!( - normalize_sender(Some("human: Dashboard ".to_string())), - "human:Dashboard" - ); - } - - #[test] - fn normalize_sender_preserves_worker_names() { - assert_eq!( - normalize_sender(Some("WorkerOne".to_string())), - "WorkerOne".to_string() - ); - } - - #[test] - fn sender_is_dashboard_label_accepts_legacy_dashboard_senders() { - assert!(sender_is_dashboard_label("Dashboard", "my-project")); - assert!(sender_is_dashboard_label("human:Dashboard", "my-project")); - 
assert!(sender_is_dashboard_label( - "human:orchestrator", - "my-project" - )); - assert!(sender_is_dashboard_label("my-project", "my-project")); - assert!(!sender_is_dashboard_label("Lead", "my-project")); - } - - #[test] - fn display_target_for_dashboard_maps_self_identity() { - let mut self_names = HashSet::new(); - self_names.insert("broker-951762d5".to_string()); - self_names.insert("DashProbe".to_string()); - let primary = "my-project"; - - assert_eq!( - display_target_for_dashboard("broker-951762d5", &self_names, primary), - "my-project" - ); - assert_eq!( - display_target_for_dashboard("dashprobe", &self_names, primary), - "my-project" - ); - assert_eq!( - display_target_for_dashboard("Lead", &self_names, primary), - "Lead".to_string() - ); - } - - #[test] - fn delivery_retry_interval_uses_default_and_env_override() { - std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); - assert_eq!(delivery_retry_interval().as_millis(), 1_000); - - std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "250"); - assert_eq!(delivery_retry_interval().as_millis(), 250); - - std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "1"); - assert_eq!(delivery_retry_interval().as_millis(), 50); - - std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); - } - - #[test] - fn http_api_timeout_windows_use_default_and_env_override() { - std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); - - assert_eq!(http_api_local_delivery_timeout().as_millis(), 3_000); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 20_000); - assert_eq!(http_api_event_emit_timeout().as_millis(), 200); - - std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "10"); - std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "100"); - std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "1"); - - 
assert_eq!(http_api_local_delivery_timeout().as_millis(), 100); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 500); - assert_eq!(http_api_event_emit_timeout().as_millis(), 25); - - std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "1500"); - std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "12000"); - std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "150"); - - assert_eq!(http_api_local_delivery_timeout().as_millis(), 1_500); - assert_eq!(http_api_relaycast_send_timeout().as_millis(), 12_000); - assert_eq!(http_api_event_emit_timeout().as_millis(), 150); - - std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); - std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); - } - - #[test] - fn drop_pending_for_worker_removes_only_matching_entries() { - let mut pending = HashMap::new(); - pending.insert( - "del_1".to_string(), - PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }, - ); - pending.insert( - "del_2".to_string(), - PendingDelivery { - worker_name: "B".to_string(), - delivery: RelayDelivery { - delivery_id: "del_2".to_string(), - event_id: "evt_2".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "y".to_string(), - target: "#general".to_string(), - body: "world".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }, - ); - - let 
dropped = drop_pending_for_worker(&mut pending, "A"); - assert_eq!(dropped, 1); - assert!(pending.contains_key("del_2")); - assert!(!pending.contains_key("del_1")); - } - - #[test] - fn should_clear_pending_delivery_when_event_id_matches() { - let pending = PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }; - - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - Some("evt_1") - )); - assert!(!should_clear_pending_delivery_for_event( - Some(&pending), - Some("evt_2") - )); - } - - #[test] - fn should_clear_pending_delivery_without_event_id_for_compatibility() { - let pending = PendingDelivery { - worker_name: "A".to_string(), - delivery: RelayDelivery { - delivery_id: "del_1".to_string(), - event_id: "evt_1".to_string(), - workspace_id: Some("ws_test".to_string()), - workspace_alias: Some("test".to_string()), - from: "x".to_string(), - target: "#general".to_string(), - body: "hello".to_string(), - thread_id: None, - priority: None, - injection_mode: MessageInjectionMode::Wait, - }, - attempts: 1, - next_retry_at: Instant::now(), - }; - - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - None - )); - assert!(should_clear_pending_delivery_for_event( - Some(&pending), - Some("") - )); - assert!(should_clear_pending_delivery_for_event(None, Some("evt_1"))); - } - - // ==================== strip_ansi tests ==================== - - #[test] - fn strip_ansi_removes_csi_sequences() { - assert_eq!(strip_ansi("\x1b[32mHello\x1b[0m"), "Hello"); - assert_eq!(strip_ansi("\x1b[1;31mred bold\x1b[0m"), "red bold"); - } - - #[test] - fn 
strip_ansi_removes_osc_sequences() { - assert_eq!(strip_ansi("\x1b]0;title\x07rest"), "rest"); - assert_eq!(strip_ansi("\x1b]0;title\x1b\\rest"), "rest"); - } - - #[test] - fn strip_ansi_preserves_plain_text() { - assert_eq!(strip_ansi("Hello world"), "Hello world"); - assert_eq!(strip_ansi(""), ""); - } - - #[test] - fn strip_ansi_handles_mixed_content() { - let input = "\x1b[33m⚠️ bypass\x1b[0m permissions mode\n\x1b[1m(yes/no)\x1b[0m"; - let clean = strip_ansi(input); - assert!(clean.contains("bypass")); - assert!(clean.contains("(yes/no)")); - assert!(!clean.contains("\x1b")); - } - - #[test] - fn strip_ansi_handles_cursor_forward_sequences() { - // Claude Code uses \x1b[1C (cursor forward) instead of spaces - // These should be replaced with spaces so echo detection works - let input = "\x1b[1CYes,\x1b[1CI\x1b[1Caccept"; - let clean = strip_ansi(input); - assert_eq!(clean, " Yes, I accept"); - } - - // ==================== floor_char_boundary tests ==================== - - #[test] - fn floor_char_boundary_at_valid_positions() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 0), 0); - assert_eq!(floor_char_boundary(s, 6), 6); - assert_eq!(floor_char_boundary(s, 9), 9); - } - - #[test] - fn floor_char_boundary_mid_multibyte() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 7), 6); - assert_eq!(floor_char_boundary(s, 8), 6); - } - - #[test] - fn floor_char_boundary_past_end() { - let s = "Hello 世界"; - assert_eq!(floor_char_boundary(s, 100), s.len()); - } - - // ==================== detect_bypass_permissions_prompt tests ==================== - - #[test] - fn bypass_perms_yes_no_prompt() { - let output = "⚠️ Bypassing all permission checks.\nDo you want to proceed? (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_dangerously_with_yn() { - let output = "Running with --dangerously-skip-permissions\nAccept the risks? 
(y/n)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_accept_risk_variant() { - let output = - "bypass permissions mode enabled\nDo you accept the risk of running in this mode?"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - } - - #[test] - fn bypass_perms_no_match_normal_output() { - let output = "I'll help you fix that bug. Let me read the file first."; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(!has_ref); - assert!(!has_confirm); - } - - #[test] - fn bypass_perms_no_false_positive_permission_without_bypass() { - let output = "File permission denied. (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(!has_ref, "permission without bypass should not match"); - assert!(has_confirm, "yes/no detected but insufficient alone"); - } - - #[test] - fn bypass_perms_no_false_positive_status_bar() { - let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref, "status bar has bypass+permissions"); - assert!(!has_confirm, "but no confirmation prompt"); - } - - #[test] - fn bypass_perms_selection_menu_format() { - let output = "WARNING: ClaudeCoderunninginBypassPermissionsmode\n\ - Byproceeding,youacceptallresponsibility\n\ - No,exit\nYes,Iaccept\nEntertoconfirm"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref); - assert!(has_confirm); - assert!(is_bypass_selection_menu(output)); - } - - #[test] - fn bypass_perms_selection_menu_with_spaces() { - let output = "WARNING: Claude Code running in Bypass Permissions mode\n\ - 1. No, exit\n2. 
Yes, I accept\nEnter to confirm"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref && has_confirm); - assert!(is_bypass_selection_menu(output)); - } - - #[test] - fn bypass_perms_legacy_not_selection_menu() { - let output = "bypass permissions mode\nProceed? (yes/no)"; - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); - assert!(has_ref && has_confirm, "legacy should still detect"); - assert!( - !is_bypass_selection_menu(output), - "legacy should NOT be selection menu" - ); - } - - #[test] - fn bypass_perms_with_raw_ansi() { - let raw = "\x1b[33m⚠️ bypass permissions\x1b[0m mode\nProceed? \x1b[1m(yes/no)\x1b[0m"; - let clean = strip_ansi(raw); - let (has_ref, has_confirm) = detect_bypass_permissions_prompt(&clean); - assert!(has_ref && has_confirm); - } - - // ==================== detect_claude_trust_prompt tests ==================== - - #[test] - fn claude_trust_prompt_full_match() { - let output = "take a moment to review what's in this folder first.\n\ - Claude Code'll be able to read, edit, and execute files here.\n\ - Security guide\n\ - ❯ 1. Yes, I trust this folder\n\ - 2. No, exit\n\ - Enter to confirm · Esc to cancel"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(has_confirmation); - } - - #[test] - fn claude_trust_prompt_stripped_spaces() { - let output = "Yes,Itrustthisfolder\nNo,exit"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(has_confirmation); - } - - #[test] - fn claude_trust_prompt_no_match_normal_output() { - let output = "I'll help you fix that bug. 
Let me read the file first."; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(!has_trust_ref); - assert!(!has_confirmation); - } - - #[test] - fn claude_trust_prompt_partial_no_exit() { - let output = "Yes, I trust this folder"; - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); - assert!(has_trust_ref); - assert!(!has_confirmation, "should not match without exit option"); - } - - #[test] - fn claude_trust_prompt_with_ansi() { - let raw = "\x1b[1m❯ 1. Yes, I trust this folder\x1b[0m\n 2. No, exit"; - let clean = strip_ansi(raw); - let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(&clean); - assert!(has_trust_ref && has_confirmation); - } - - // ==================== is_in_editor_mode tests ==================== - - #[test] - fn editor_mode_vim_insert() { - assert!(is_in_editor_mode("Some text\n-- INSERT --\n")); - assert!(is_in_editor_mode("Some text\n-- INSERT --")); - } - - #[test] - fn editor_mode_claude_cli_not_vim() { - let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; - assert!(!is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_nano() { - let output = " GNU nano 5.8\nFile: test.txt\n^G Get Help ^O Write Out"; - assert!(is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_less_pager() { - assert!(is_in_editor_mode("some content\n(END)")); - assert!(is_in_editor_mode("some content\n--More--")); - } - - #[test] - fn editor_mode_normal_output() { - assert!(!is_in_editor_mode( - "I'll help you with that task. Let me search." 
- )); - assert!(!is_in_editor_mode("$ ls -la\ntotal 0\n$ ")); - } - - #[test] - fn editor_mode_with_ansi() { - let output = "\x1b[32mSome text\x1b[0m\n-- INSERT --\n"; - assert!(is_in_editor_mode(output)); - } - - #[test] - fn editor_mode_vim_visual_modes() { - assert!(is_in_editor_mode("text\n-- VISUAL --\n")); - assert!(is_in_editor_mode("text\n-- VISUAL LINE --\n")); - assert!(is_in_editor_mode("text\n-- VISUAL BLOCK --\n")); - assert!(is_in_editor_mode("text\n-- REPLACE --\n")); - } - - #[test] - fn editor_mode_claude_normal_not_vim() { - assert!(!is_in_editor_mode("-- NORMAL -- ► some Claude UI text")); - assert!(!is_in_editor_mode("-- VISUAL -- ▶ Claude UI")); - } - - #[test] - fn auto_suggestion_detects_cursor_plus_dim_pattern() { - assert!(is_auto_suggestion( - "\x1b[7mW\x1b[27m\x1b[2mhat's the task?\x1b[22m" - )); - } - - #[test] - fn auto_suggestion_detects_send_hint() { - assert!(is_auto_suggestion(" ↵ send")); - } - - #[test] - fn auto_suggestion_ignores_normal_output() { - assert!(!is_auto_suggestion("Relay message from Alice [abc]: hello")); - assert!(!is_auto_suggestion("Running tests...")); - assert!(!is_auto_suggestion("> \x1b[7m \x1b[27m")); - } - - #[test] - fn extract_mcp_ids_from_tool_response() { - let output = r#" ⎿ { - "id": "147310274064424960", - "conversation_id": "147310245874507776", - "from": "agent-a", - "text": "hello" - }"#; - let ids = extract_mcp_message_ids(output); - // Only extracts "id" keys, not "conversation_id" - assert_eq!(ids, vec!["147310274064424960"]); - } - - #[test] - fn extract_mcp_ids_ignores_short_ids() { - let output = r#""id": "123""#; - assert!(extract_mcp_message_ids(output).is_empty()); - } - - #[test] - fn extract_mcp_ids_ignores_non_numeric() { - let output = r#""id": "msg_abc123def456ghi""#; - assert!(extract_mcp_message_ids(output).is_empty()); - } - - #[test] - fn extract_mcp_ids_handles_no_ids() { - assert!(extract_mcp_message_ids("normal output with no JSON").is_empty()); - 
assert!(extract_mcp_message_ids("").is_empty()); - } - - // ==================== bypass flag selection logic tests ==================== - // Tests for the bypass flag logic used in WorkerRegistry::spawn(). - // The logic is: claude/claude:* → --dangerously-skip-permissions, codex → --dangerously-bypass-approvals-and-sandbox - - fn compute_bypass_flag(cli: &str, existing_args: &[String]) -> Option<&'static str> { - let cli_lower = cli.to_lowercase(); - if (cli_lower == "claude" || cli_lower.starts_with("claude:")) - && !existing_args - .iter() - .any(|a| a.contains("dangerously-skip-permissions")) - { - Some("--dangerously-skip-permissions") - } else if cli_lower == "codex" - && !existing_args - .iter() - .any(|a| a.contains("dangerously-bypass") || a.contains("full-auto")) - { - Some("--dangerously-bypass-approvals-and-sandbox") - } else if cli_lower == "gemini" && !existing_args.iter().any(|a| a == "--yolo" || a == "-y") - { - Some("--yolo") - } else { - None - } - } - - #[test] - fn bypass_flag_claude_gets_skip_permissions() { - assert_eq!( - compute_bypass_flag("claude", &[]), - Some("--dangerously-skip-permissions") - ); - } - - #[test] - fn bypass_flag_claude_variant_gets_skip_permissions() { - assert_eq!( - compute_bypass_flag("claude:latest", &[]), - Some("--dangerously-skip-permissions") - ); - assert_eq!( - compute_bypass_flag("Claude", &[]), - Some("--dangerously-skip-permissions") - ); - assert_eq!( - compute_bypass_flag("CLAUDE:v2", &[]), - Some("--dangerously-skip-permissions") - ); - } - - #[test] - fn bypass_flag_codex_gets_dangerously_bypass() { - assert_eq!( - compute_bypass_flag("codex", &[]), - Some("--dangerously-bypass-approvals-and-sandbox") - ); - } - - #[test] - fn bypass_flag_gemini_gets_yolo() { - assert_eq!(compute_bypass_flag("gemini", &[]), Some("--yolo")); - } - - #[test] - fn bypass_flag_gemini_dedup_when_yolo_present() { - let args = vec!["--yolo".to_string()]; - assert_eq!( - compute_bypass_flag("gemini", &args), - None, - "should 
not duplicate --yolo flag" - ); - } - - #[test] - fn bypass_flag_gemini_dedup_when_y_present() { - let args = vec!["-y".to_string()]; - assert_eq!( - compute_bypass_flag("gemini", &args), - None, - "should not duplicate when -y shorthand present" - ); - } - - #[test] - fn bypass_flag_aider_gets_none() { - assert_eq!(compute_bypass_flag("aider", &[]), None); - } - - #[test] - fn bypass_flag_goose_gets_none() { - assert_eq!(compute_bypass_flag("goose", &[]), None); - } - - #[test] - fn bypass_flag_unknown_cli_gets_none() { - assert_eq!(compute_bypass_flag("mystery-cli", &[]), None); - } - - #[test] - fn bypass_flag_claude_dedup_when_already_present() { - let args = vec!["--dangerously-skip-permissions".to_string()]; - assert_eq!( - compute_bypass_flag("claude", &args), - None, - "should not duplicate flag" - ); - } - - #[test] - fn bypass_flag_codex_dedup_when_already_present() { - let args = vec!["--dangerously-bypass-approvals-and-sandbox".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - None, - "should not duplicate flag" - ); - } - - #[test] - fn bypass_flag_codex_dedup_when_full_auto_present() { - let args = vec!["--full-auto".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - None, - "should not add bypass when --full-auto already present" - ); - } - - #[test] - fn bypass_flag_claude_dedup_partial_match() { - // If someone passes a different arg containing the substring, still dedup - let args = vec!["--my-dangerously-skip-permissions-flag".to_string()]; - assert_eq!( - compute_bypass_flag("claude", &args), - None, - "substring match should prevent duplication" - ); - } - - #[test] - fn bypass_flag_codex_with_other_args() { - let args = vec!["--model".to_string(), "gpt-4".to_string()]; - assert_eq!( - compute_bypass_flag("codex", &args), - Some("--dangerously-bypass-approvals-and-sandbox"), - "unrelated args should not prevent bypass flag" - ); - } - - // ==================== is_pid_alive ==================== - - #[test] - 
fn is_pid_alive_returns_true_for_self() { - let pid = std::process::id(); - assert!( - crate::broker::is_pid_alive(pid), - "current process PID should be alive" - ); - } - - #[test] - fn is_pid_alive_returns_false_for_dead_pid() { - // Spawn a short-lived child, wait for it to exit, then verify it's dead - let child = std::process::Command::new("true") - .spawn() - .expect("failed to spawn 'true'"); - let pid = child.id(); - let mut child = child; - child.wait().expect("failed to wait on child"); - // After the child exits, its PID should not be alive - // (the PID may be recycled, but on macOS/Linux it won't be immediately) - assert!( - !crate::broker::is_pid_alive(pid), - "exited child PID should be dead" - ); - } - - #[test] - fn is_pid_alive_returns_false_for_bogus_pid() { - // PID 0 is the kernel scheduler — kill(0, 0) signals the entire process group, - // not a real target. Use a very high PID that almost certainly doesn't exist. - // On macOS pid_max is ~99999; on Linux it's typically 32768 or 4194304. - // 4_000_000 is unlikely to be in use. - assert!( - !crate::broker::is_pid_alive(4_000_000), - "bogus PID 4_000_000 should not be alive (ESRCH)" - ); - } - - #[test] - fn is_pid_alive_eperm_means_alive() { - // PID 1 (launchd/init) is owned by root. When run as a normal user, - // kill(1, 0) returns EPERM — the process exists but we can't signal it. - // This is exactly the EPERM case our fix handles. - // Skip if running as root (e.g., in some CI containers) since root can - // signal any process and would get rc=0 instead of EPERM. 
- if unsafe { nix::libc::getuid() } == 0 { - eprintln!("skipping EPERM test: running as root"); - return; - } - assert!( - crate::broker::is_pid_alive(1), - "PID 1 (init/launchd) should report alive via EPERM" - ); - } - - // ==================== write_pid_file ==================== - - // ==================== continuity_dir ==================== - - #[test] - fn continuity_dir_derives_correct_path_from_state_json() { - let state_path = std::path::Path::new("/project/.agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!( - result, - std::path::PathBuf::from("/project/.agent-relay/continuity") - ); - } - - #[test] - fn continuity_dir_works_with_nested_project_path() { - let state_path = std::path::Path::new("/home/user/projects/my-app/.agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!( - result, - std::path::PathBuf::from("/home/user/projects/my-app/.agent-relay/continuity") - ); - } - - #[test] - fn continuity_dir_preserves_relative_paths() { - let state_path = std::path::Path::new(".agent-relay/state.json"); - let result = continuity_dir(state_path); - assert_eq!(result, std::path::PathBuf::from(".agent-relay/continuity")); - } - - #[test] - fn http_api_spawn_spec_defaults_to_pty_runtime() { - let spec = build_http_api_spawn_spec( - "worker-a".to_string(), - "codex".to_string(), - None, - Some("o3".to_string()), - vec!["--fast".to_string()], - vec!["general".to_string()], - Some("/tmp/project".to_string()), - Some("core".to_string()), - Some("Lead".to_string()), - Some("subagent".to_string()), - None, - ) - .expect("spec should build"); - - assert!(matches!(spec.runtime, AgentRuntime::Pty)); - assert!(spec.provider.is_none()); - assert_eq!(spec.cli.as_deref(), Some("codex")); - assert_eq!(spec.model.as_deref(), Some("o3")); - } - - #[test] - fn http_api_spawn_spec_uses_headless_runtime_for_supported_providers() { - let spec = build_http_api_spawn_spec( - "worker-a".to_string(), - "opencode".to_string(), - 
Some("headless".to_string()), - Some("ignored".to_string()), - vec![], - vec!["general".to_string()], - None, - None, - None, - None, - None, - ) - .expect("headless spec should build"); - - assert!(matches!(spec.runtime, AgentRuntime::Headless)); - assert!(matches!( - spec.provider, - Some(ProtocolHeadlessProvider::Opencode) - )); - assert!(spec.cli.is_none()); - assert_eq!(spec.model.as_deref(), Some("ignored")); - } - - #[test] - fn headless_provider_command_claude_places_flags_before_task() { - let (bin, args) = super::headless_provider_command( - &ProtocolHeadlessProvider::Claude, - "hello world", - &[ - "--mcp-config".to_string(), - "{\"mcpServers\":{}}".to_string(), - ], - ); - - assert_eq!(bin, "claude"); - assert_eq!(args.last().map(String::as_str), Some("hello world")); - let mcp_pos = args.iter().position(|a| a == "--mcp-config").unwrap(); - let task_pos = args.iter().position(|a| a == "hello world").unwrap(); - assert!(mcp_pos < task_pos, "--mcp-config must precede task"); - } - - #[test] - fn headless_provider_command_opencode_places_flags_before_task() { - let (bin, args) = super::headless_provider_command( - &ProtocolHeadlessProvider::Opencode, - "hello world", - &["--agent".to_string(), "relaycast".to_string()], - ); - - assert_eq!(bin, "opencode"); - assert_eq!(args.first().map(String::as_str), Some("run")); - assert_eq!(args.last().map(String::as_str), Some("hello world")); - let agent_pos = args.iter().position(|a| a == "--agent").unwrap(); - let task_pos = args.iter().position(|a| a == "hello world").unwrap(); - assert!(agent_pos < task_pos, "--agent must precede task"); - } - - #[test] - fn http_api_spawn_spec_rejects_unknown_headless_providers() { - let error = build_http_api_spawn_spec( - "worker-a".to_string(), - "codex".to_string(), - Some("headless".to_string()), - None, - vec![], - vec!["general".to_string()], - None, - None, - None, - None, - None, - ) - .expect_err("unsupported headless provider should fail"); - - assert!( - error - 
.to_string() - .contains("does not support headless transport"), - "unexpected error: {error}" - ); - } - - // ==================== model flag injection tests ==================== - // Tests for the --model flag injection logic used in WorkerRegistry::spawn(). - // When spec.model is set and non-empty, the broker should inject --model - // into the spawned CLI's argv, unless the user already specified --model. - - /// Mirror of the model flag logic in WorkerRegistry::spawn(). - fn compute_model_flag(model: Option<&str>, existing_args: &[String]) -> Option { - model.and_then(|m| { - if m.is_empty() - || existing_args - .iter() - .any(|a| a == "--model" || a.starts_with("--model=") || a == "-m") - { - None - } else { - Some(m.to_string()) - } - }) - } - - #[test] - fn model_flag_injected_when_present() { - assert_eq!( - compute_model_flag(Some("haiku"), &[]), - Some("haiku".to_string()), - "model should be injected when set and args are empty" - ); - } - - #[test] - fn model_flag_not_injected_when_none() { - assert_eq!( - compute_model_flag(None, &[]), - None, - "model should not be injected when not set" - ); - } - - #[test] - fn model_flag_not_injected_when_empty() { - assert_eq!( - compute_model_flag(Some(""), &[]), - None, - "model should not be injected when empty string" - ); - } - - #[test] - fn model_flag_not_injected_when_already_in_args() { - let args = vec!["--model".to_string(), "opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), &args), - None, - "model should not be injected when --model already in args" - ); - } - - #[test] - fn model_flag_not_injected_when_short_flag_in_args() { - let args = vec!["-m".to_string(), "opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), &args), - None, - "model should not be injected when -m already in args" - ); - } - - #[test] - fn model_flag_not_injected_when_equals_format_in_args() { - let args = vec!["--model=opus".to_string()]; - assert_eq!( - compute_model_flag(Some("haiku"), 
&args), - None, - "model should not be injected when --model=value already in args" - ); - } - - #[test] - fn model_flag_injected_with_other_args() { - let args = vec!["--verbose".to_string()]; - assert_eq!( - compute_model_flag(Some("gpt-4o"), &args), - Some("gpt-4o".to_string()), - "model should be injected when other unrelated args exist" - ); - } -} diff --git a/crates/broker/src/runtime/io.rs b/crates/broker/src/runtime/io.rs new file mode 100644 index 000000000..3383add9d --- /dev/null +++ b/crates/broker/src/runtime/io.rs @@ -0,0 +1,70 @@ +use super::*; + +pub(crate) async fn send_error( + tx: &mpsc::Sender>, + request_id: Option, + code: &str, + message: String, + retryable: bool, + data: Option, +) -> Result<()> { + send_frame( + tx, + "error", + request_id, + json!({ + "code": code, + "message": message, + "retryable": retryable, + "data": data, + }), + ) + .await +} + +pub(crate) async fn send_event( + tx: &mpsc::Sender>, + payload: Value, +) -> Result<()> { + send_frame(tx, "event", None, payload).await +} + +pub(crate) async fn emit_http_api_event_with_timeout( + tx: &mpsc::Sender>, + payload: Value, + timeout_window: Duration, +) { + match timeout(timeout_window, send_event(tx, payload)).await { + Ok(Ok(())) => {} + Ok(Err(error)) => { + tracing::warn!( + target = "relay_broker::http_api", + error = %error, + "failed to enqueue HTTP API event" + ); + } + Err(_) => { + tracing::warn!( + target = "relay_broker::http_api", + timeout_ms = %timeout_window.as_millis(), + "timed out enqueuing HTTP API event" + ); + } + } +} + +pub(crate) async fn send_frame( + tx: &mpsc::Sender>, + msg_type: &str, + request_id: Option, + payload: Value, +) -> Result<()> { + tx.send(ProtocolEnvelope { + v: PROTOCOL_VERSION, + msg_type: msg_type.to_string(), + request_id, + payload, + }) + .await + .context("failed to enqueue outbound frame") +} diff --git a/crates/broker/src/runtime/messages.rs b/crates/broker/src/runtime/messages.rs new file mode 100644 index 
000000000..ca6d3f61a --- /dev/null +++ b/crates/broker/src/runtime/messages.rs @@ -0,0 +1,572 @@ +use super::*; + +#[derive(Debug, Clone, Serialize, PartialEq, Eq)] +#[serde(rename_all = "camelCase")] +pub(crate) struct ThreadInfo { + pub(super) thread_id: String, + pub(super) name: String, + pub(super) unread_count: usize, + #[serde(skip_serializing_if = "Option::is_none")] + pub(super) last_message: Option, + #[serde(skip_serializing_if = "Option::is_none")] + pub(super) last_message_at: Option, +} + +#[derive(Debug, Clone)] +pub(crate) struct ThreadAccumulator { + info: ThreadInfo, + sort_key: i64, +} + +pub(crate) fn normalize_sender(sender: Option) -> String { + let raw = sender + .unwrap_or_else(|| "human:orchestrator".to_string()) + .trim() + .to_string(); + if raw.is_empty() { + return "human:orchestrator".to_string(); + } + if let Some(rest) = raw.strip_prefix("human:") { + let normalized_rest = rest.trim(); + if normalized_rest.is_empty() { + return "human:orchestrator".to_string(); + } + return format!("human:{normalized_rest}"); + } + raw +} + +pub(crate) fn sender_is_dashboard_label(sender: &str, self_name: &str) -> bool { + let trimmed = sender.trim(); + trimmed.eq_ignore_ascii_case("Dashboard") + || trimmed.eq_ignore_ascii_case("human:Dashboard") + || trimmed.eq_ignore_ascii_case("human:orchestrator") + || trimmed.eq_ignore_ascii_case(self_name) +} + +pub(crate) fn normalize_identity_for_thread(raw: &str) -> String { + raw.trim().trim_start_matches('@').to_ascii_lowercase() +} + +pub(crate) fn json_scalar_to_string(value: &Value) -> Option { + match value { + Value::String(text) => { + let trimmed = text.trim(); + if trimmed.is_empty() { + None + } else { + Some(trimmed.to_string()) + } + } + Value::Number(number) => Some(number.to_string()), + _ => None, + } +} + +pub(crate) fn first_string(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(json_scalar_to_string)) +} + +pub(crate) 
fn first_bool(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(Value::as_bool)) +} + +pub(crate) fn first_u64(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(Value::as_u64)) +} + +pub(crate) fn first_i64(value: &Value, pointers: &[&str]) -> Option { + pointers + .iter() + .find_map(|pointer| value.pointer(pointer).and_then(Value::as_i64)) +} + +pub(crate) fn relaycast_ws_control_dedup_key( + workspace_id: &str, + ws_type: &str, + value: &Value, +) -> Option { + let identity = if ws_type == "agent.spawn_requested" { + relaycast_ws_spawn_token(value) + .or_else(|| { + first_string( + value, + &[ + "/event_id", + "/id", + "/payload/id", + "/payload/event_id", + "/agent/id", + "/agent/event_id", + "/message/id", + "/message/event_id", + "/message_id", + ], + ) + }) + .or_else(|| first_string(value, &["/agent/name", "/payload/agent/name", "/name"])) + } else { + first_string( + value, + &[ + "/event_id", + "/id", + "/payload/id", + "/payload/event_id", + "/agent/id", + "/agent/event_id", + "/message/id", + "/message/event_id", + "/message_id", + ], + ) + } + .or_else(|| serde_json::to_string(value).ok())?; + Some(format!("control:{workspace_id}:{ws_type}:{identity}")) +} + +pub(crate) fn relaycast_ws_spawn_token(value: &Value) -> Option { + first_string( + value, + &[ + "/agent/token", + "/agent/relay_key", + "/agent/api_key", + "/token", + ], + ) +} + +pub(crate) fn relaycast_spawn_control_dedup_key(workspace_id: &str, identity: &str) -> String { + format!("control:{workspace_id}:agent.spawn_requested:{identity}") +} + +pub(crate) fn relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key: Option<&str>, + local_spawn_echo_key: &str, +) -> bool { + control_dedup_key != Some(local_spawn_echo_key) +} + +pub(crate) fn note_local_spawn_control_dedup( + dedup: &mut DedupCache, + workspace_id: Option<&str>, + 
agent_name: &str, + relay_key: Option<&str>, +) { + let Some(workspace_id) = workspace_id else { + return; + }; + let agent_name = agent_name.trim(); + if !agent_name.is_empty() { + let key = relaycast_spawn_control_dedup_key(workspace_id, agent_name); + dedup.insert_if_new(&key, Instant::now()); + } + if let Some(relay_key) = relay_key.map(str::trim).filter(|value| !value.is_empty()) { + let key = relaycast_spawn_control_dedup_key(workspace_id, relay_key); + dedup.insert_if_new(&key, Instant::now()); + } +} + +pub(crate) fn is_unknown_worker_error_message(message: &str) -> bool { + message.contains("unknown worker '") +} + +pub(crate) fn is_relaycast_self_control_target( + name: &str, + workspace_self_name: &str, + workspace_self_names: &HashSet, +) -> bool { + let normalized = normalize_identity_for_thread(name); + normalized == normalize_identity_for_thread(workspace_self_name) + || workspace_self_names.contains(&normalized) +} + +pub(crate) fn message_sender(value: &Value) -> Option { + first_string( + value, + &[ + "/from", + "/sender", + "/author", + "/agent_name", + "/message/from", + "/message/sender", + "/message/author", + "/payload/from", + "/payload/sender", + "/payload/author", + "/payload/message/from", + "/payload/message/sender", + "/payload/message/author", + ], + ) +} + +pub(crate) fn message_target(value: &Value) -> Option { + first_string( + value, + &[ + "/target", + "/to", + "/recipient", + "/channel", + "/conversation_id", + "/conversationId", + "/message/target", + "/message/to", + "/message/recipient", + "/message/channel", + "/message/conversation_id", + "/message/conversationId", + "/payload/target", + "/payload/to", + "/payload/recipient", + "/payload/channel", + "/payload/conversation_id", + "/payload/conversationId", + "/payload/message/target", + "/payload/message/to", + "/payload/message/recipient", + "/payload/message/channel", + "/payload/message/conversation_id", + "/payload/message/conversationId", + ], + ) +} + +pub(crate) fn 
message_preview(value: &Value) -> Option { + let text = first_string( + value, + &[ + "/text", + "/body", + "/content", + "/message/text", + "/message/body", + "/message/content", + "/payload/text", + "/payload/body", + "/payload/content", + "/payload/message/text", + "/payload/message/body", + "/payload/message/content", + "/message", + "/payload/message", + ], + )?; + Some(truncate_thread_preview(&text, 200)) +} + +pub(crate) fn truncate_thread_preview(input: &str, max_len: usize) -> String { + let trimmed = input.trim(); + if trimmed.len() <= max_len { + return trimmed.to_string(); + } + let boundary = floor_char_boundary(trimmed, max_len); + let mut out = trimmed[..boundary].to_string(); + out.push_str("..."); + out +} + +pub(crate) fn parse_sort_key_from_raw_timestamp(raw: &str) -> Option { + let trimmed = raw.trim(); + if trimmed.is_empty() { + return None; + } + if let Ok(epoch) = trimmed.parse::() { + return Some(epoch); + } + chrono::DateTime::parse_from_rfc3339(trimmed) + .ok() + .map(|parsed| parsed.timestamp_millis()) +} + +pub(crate) fn message_timestamp_string(value: &Value) -> Option { + first_string( + value, + &[ + "/created_at", + "/createdAt", + "/timestamp", + "/ts", + "/message/created_at", + "/message/createdAt", + "/message/timestamp", + "/message/ts", + "/payload/created_at", + "/payload/createdAt", + "/payload/timestamp", + "/payload/ts", + "/payload/message/created_at", + "/payload/message/createdAt", + "/payload/message/timestamp", + "/payload/message/ts", + ], + ) +} + +pub(crate) fn message_sort_key(value: &Value, index: usize) -> i64 { + if let Some(raw) = message_timestamp_string(value) { + if let Some(parsed) = parse_sort_key_from_raw_timestamp(&raw) { + return parsed; + } + } + + first_i64( + value, + &[ + "/created_at", + "/createdAt", + "/timestamp", + "/ts", + "/message/created_at", + "/message/createdAt", + "/message/timestamp", + "/message/ts", + "/payload/created_at", + "/payload/createdAt", + "/payload/timestamp", + 
"/payload/ts", + ], + ) + .unwrap_or(index as i64) +} + +pub(crate) fn message_thread_id(value: &Value) -> Option { + if let Some(explicit) = first_string( + value, + &[ + "/thread_id", + "/threadId", + "/parent_id", + "/conversation_id", + "/conversationId", + "/message/thread_id", + "/message/threadId", + "/message/parent_id", + "/message/conversation_id", + "/message/conversationId", + "/payload/thread_id", + "/payload/threadId", + "/payload/parent_id", + "/payload/conversation_id", + "/payload/conversationId", + "/payload/message/thread_id", + "/payload/message/threadId", + "/payload/message/parent_id", + "/payload/message/conversation_id", + "/payload/message/conversationId", + ], + ) { + return Some(explicit); + } + + let target = message_target(value)?; + if target.starts_with('#') { + return Some(normalize_channel(&target)); + } + if target.starts_with("conv_") + || target.starts_with("dm_") + || target.chars().all(|ch| ch.is_ascii_digit()) + { + return Some(target); + } + + let sender = message_sender(value)?; + let sender = normalize_identity_for_thread(&sender); + let target = normalize_identity_for_thread(&target); + if sender.is_empty() || target.is_empty() { + return None; + } + let (first, second) = if sender <= target { + (sender, target) + } else { + (target, sender) + }; + Some(format!("direct:{first}:{second}")) +} + +pub(crate) fn is_self_identity(value: &str, self_names: &HashSet) -> bool { + let normalized = normalize_identity_for_thread(value); + !normalized.is_empty() + && self_names + .iter() + .any(|self_name| normalize_identity_for_thread(self_name) == normalized) +} + +pub(crate) fn derive_thread_name( + message: &Value, + thread_id: &str, + self_names: &HashSet, +) -> String { + if let Some(explicit) = first_string( + message, + &[ + "/thread_name", + "/threadName", + "/title", + "/subject", + "/conversation_name", + "/conversationName", + ], + ) { + return explicit; + } + + if thread_id.starts_with('#') { + return 
thread_id.to_string(); + } + + // Use participants array (from workspace-level DM data) to build a combined name + // like "WorkerA ↔ WorkerB" for DMs between non-broker agents. + if let Some(participants) = message.get("participants").and_then(|v| v.as_array()) { + let names: Vec<&str> = participants + .iter() + .filter_map(|p| p.as_str()) + .filter(|name| !is_self_identity(name, self_names)) + .collect(); + if names.len() >= 2 { + return format!("{} ↔ {}", names[0], names[1]); + } else if names.len() == 1 { + return names[0].to_string(); + } + } + + if let Some(sender) = message_sender(message) { + if !is_self_identity(&sender, self_names) { + return sender.trim().trim_start_matches('@').to_string(); + } + } + + if let Some(target) = message_target(message) { + let trimmed = target.trim().trim_start_matches('@'); + if trimmed.starts_with('#') { + return normalize_channel(trimmed); + } + if !trimmed.is_empty() + && !trimmed.eq_ignore_ascii_case(thread_id) + && !is_self_identity(trimmed, self_names) + && !trimmed.starts_with("conv_") + && !trimmed.starts_with("dm_") + && !trimmed.chars().all(|ch| ch.is_ascii_digit()) + { + return trimmed.to_string(); + } + } + + thread_id.to_string() +} + +pub(crate) fn thread_unread_increment(message: &Value, self_names: &HashSet) -> usize { + if let Some(read) = first_bool( + message, + &[ + "/read", + "/is_read", + "/isRead", + "/message/read", + "/message/is_read", + "/message/isRead", + "/payload/read", + "/payload/is_read", + "/payload/isRead", + "/payload/message/read", + "/payload/message/is_read", + "/payload/message/isRead", + ], + ) { + return usize::from(!read); + } + + if let Some(sender) = message_sender(message) { + return usize::from(!is_self_identity(&sender, self_names)); + } + 0 +} + +pub(crate) fn build_thread_infos( + messages: &[Value], + self_names: &HashSet, +) -> Vec { + let mut by_thread: HashMap = HashMap::new(); + + for (index, message) in messages.iter().enumerate() { + let Some(thread_id) = 
message_thread_id(message) else { + continue; + }; + + let name = derive_thread_name(message, &thread_id, self_names); + let sort_key = message_sort_key(message, index); + let preview = message_preview(message); + let timestamp = message_timestamp_string(message); + let explicit_unread = first_u64( + message, + &[ + "/unread_count", + "/unreadCount", + "/message/unread_count", + "/message/unreadCount", + "/payload/unread_count", + "/payload/unreadCount", + "/payload/message/unread_count", + "/payload/message/unreadCount", + ], + ) + .map(|value| value as usize); + let unread_delta = thread_unread_increment(message, self_names); + + let entry = by_thread + .entry(thread_id.clone()) + .or_insert_with(|| ThreadAccumulator { + info: ThreadInfo { + thread_id: thread_id.clone(), + name: name.clone(), + unread_count: 0, + last_message: None, + last_message_at: None, + }, + sort_key, + }); + + if entry.info.name == entry.info.thread_id && name != entry.info.thread_id { + entry.info.name = name.clone(); + } + + if let Some(explicit_unread) = explicit_unread { + entry.info.unread_count = entry.info.unread_count.max(explicit_unread); + } else { + entry.info.unread_count = entry.info.unread_count.saturating_add(unread_delta); + } + + if sort_key >= entry.sort_key { + entry.sort_key = sort_key; + entry.info.name = name; + entry.info.last_message = preview; + entry.info.last_message_at = timestamp; + } + } + + let mut threads: Vec = by_thread.into_values().collect(); + threads.sort_by(|left, right| { + right + .sort_key + .cmp(&left.sort_key) + .then_with(|| left.info.thread_id.cmp(&right.info.thread_id)) + }); + + threads.into_iter().map(|entry| entry.info).collect() +} + +pub(crate) fn record_thread_history_event(history: &mut VecDeque, event: Value) { + if history.len() >= THREAD_HISTORY_LIMIT { + let _ = history.pop_front(); + } + history.push_back(event); +} diff --git a/crates/broker/src/runtime/mod.rs b/crates/broker/src/runtime/mod.rs new file mode 100644 index 
000000000..10ff87895 --- /dev/null +++ b/crates/broker/src/runtime/mod.rs @@ -0,0 +1,89 @@ +use std::{ + collections::{HashMap, HashSet, VecDeque}, + path::{Path, PathBuf}, + process::Stdio, + sync::{Arc, OnceLock}, + time::{Duration, Instant}, +}; + +use crate::helpers::{ + agent_name_eq, floor_char_boundary, is_self_name, normalize_cli_name, parse_cli_command, + resolve_dm_participants_cached, +}; +use crate::listen_api::{ + broadcast_if_relevant, listen_api_router, DeliveryRouteError, ListenApiConfig, + ListenApiRequest, SetInboundDeliveryModeOk, +}; +use crate::routing::display_target_for_dashboard; + +use anyhow::{Context, Result}; +use relaycast::WsEvent; +use serde::{Deserialize, Serialize}; +use serde_json::{json, Value}; +use tokio::{ + io::{AsyncBufReadExt, AsyncReadExt, AsyncWriteExt, BufReader}, + sync::{broadcast, mpsc, Notify, RwLock}, + time::{timeout, MissedTickBehavior}, +}; +use uuid::Uuid; + +use relay_broker::{ + auth::AuthClient, + dedup::DedupCache, + message_bridge::map_ws_event, + multi_workspace::{MultiWorkspaceSession, WorkspaceInboundMessage, WorkspaceMembershipSummary}, + protocol::{ + AgentRuntime, AgentSpec, HeadlessProvider as ProtocolHeadlessProvider, + MessageInjectionMode, ProtocolEnvelope, RelayDelivery, PROTOCOL_VERSION, + }, + relaycast_ws::{ + format_worker_preregistration_error, registration_retry_after_secs, + retry_agent_registration, RegRetryOutcome, RelaycastHttpClient, WsControl, + }, + replay_buffer::{ReplayBuffer, DEFAULT_REPLAY_CAPACITY}, + snippets::ensure_relaycast_mcp_config, + telemetry::{ActionSource, TelemetryClient, TelemetryEvent}, + types::{ + BrokerCommandEvent, InboundDeliveryDispatch, InboundDeliveryMode, InboundDeliveryState, + InboundKind, PendingRelayMessage, + }, +}; + +use crate::cli::{DumpPtyCommand, DumpPtyFormat, HeadlessCommand, InitCommand}; +use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; +use crate::{broker, listen_api, routing, worker_request}; + +const 
DEFAULT_DELIVERY_RETRY_MS: u64 = 1_000; +const MAX_DELIVERY_RETRIES: u32 = 10; +const DEFAULT_RELAYCAST_BASE_URL: &str = "https://api.relaycast.dev"; +const THREAD_HISTORY_LIMIT: usize = 1_000; +const DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS: u64 = 3_000; +const DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS: u64 = 20_000; +const DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS: u64 = 200; +static TRACING_GUARD: OnceLock = OnceLock::new(); + +mod connection; +mod delivery; +mod headless; +mod init; +mod io; +mod messages; +mod paths; +mod session; +mod spawn_spec; +mod system; +#[cfg(test)] +mod tests; +mod util; + +pub(crate) use connection::*; +pub(crate) use delivery::*; +pub(crate) use headless::*; +pub(crate) use init::*; +pub(crate) use io::*; +pub(crate) use messages::*; +pub(crate) use paths::*; +pub(crate) use session::*; +pub(crate) use spawn_spec::*; +pub(crate) use system::*; +pub(crate) use util::*; diff --git a/crates/broker/src/runtime/paths.rs b/crates/broker/src/runtime/paths.rs new file mode 100644 index 000000000..eb0301b75 --- /dev/null +++ b/crates/broker/src/runtime/paths.rs @@ -0,0 +1,184 @@ +use super::*; + +#[derive(Debug)] +pub(crate) struct RuntimePaths { + pub(super) persist: bool, + pub(super) state: PathBuf, + pub(super) pending: PathBuf, + /// Held for process lifetime to prevent concurrent broker instances (persist mode only). + #[allow(dead_code)] + pub(super) _lock: Option, +} + +/// Returns the continuity directory path derived from the state file path. +/// State path is always `{cwd}/.agent-relay/state.json`, so parent is `{cwd}/.agent-relay/`. +pub(crate) fn continuity_dir(state_path: &Path) -> PathBuf { + state_path + .parent() + .expect("state_path always has a parent (.agent-relay/)") + .join("continuity") +} + +/// Create ephemeral runtime paths in the system temp directory. 
+/// +/// Unlike `ensure_runtime_paths`, this function: +/// - Writes nothing to the project directory +/// - Uses a temp directory named after the current process ID +/// (`agent-relay-ephemeral-{pid}`); `_cwd` and `_broker_name` are unused +/// +/// The temp directory is NOT removed on exit — the OS cleans it up on reboot. +/// State and pending-delivery files are still written there so they don't +/// interfere with the project tree; they're just ephemeral. +/// Ephemeral mode: no lock file and no PID file are created here (`_lock` is `None`). +/// The broker lifecycle is tied to the parent process via stdin — when the +/// parent (SDK client) exits, stdin gets EOF and the broker shuts down. +/// Single-instance enforcement is unnecessary here because each SDK client +/// manages its own child process. +pub(crate) fn ensure_ephemeral_paths(_cwd: &Path, _broker_name: &str) -> Result { + // Use a per-process (PID-named) temp subdir so concurrent ephemeral brokers don't collide + // on state files. + let root = std::env::temp_dir().join(format!("agent-relay-ephemeral-{}", std::process::id())); + std::fs::create_dir_all(&root) + .with_context(|| format!("failed to create ephemeral temp dir {}", root.display()))?; + + Ok(RuntimePaths { + persist: false, + state: root.join("state.json"), + pending: root.join("pending.json"), + _lock: None, + }) +} + +pub(crate) fn ensure_runtime_paths( + cwd: &Path, + broker_name: &str, + state_dir: Option<&Path>, +) -> Result { + let root = state_dir + .map(PathBuf::from) + .unwrap_or_else(|| cwd.join(".agent-relay")); + std::fs::create_dir_all(&root) + .with_context(|| format!("failed to create runtime dir {}", root.display()))?; + + // Sanitise name for use in filenames — keep only alphanumeric and hyphens + let safe_name: String = broker_name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect(); + + // Lock and PID files are per-broker-name so concurrent workflows can coexist.
+ let lock_path = root.join(format!("broker-{safe_name}.lock")); + let lock_file = std::fs::File::create(&lock_path) + .with_context(|| format!("failed to create lock file {}", lock_path.display()))?; + + #[cfg(unix)] + { + use std::os::unix::io::AsRawFd; + let fd = lock_file.as_raw_fd(); + let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + // Lock acquisition failed — check if the holder is still alive + // by reading the PID from connection.json. + let connection_path = root.join("connection.json"); + let old_pid = std::fs::read_to_string(&connection_path) + .ok() + .and_then(|c| serde_json::from_str::(&c).ok()) + .and_then(|v| v.get("pid").and_then(|p| p.as_u64())) + .map(|p| p as u32); + if let Some(old_pid) = old_pid { + if !broker::is_pid_alive(old_pid) { + tracing::warn!( + old_pid = old_pid, + "stale broker lock detected (PID {} is dead), recovering", + old_pid + ); + // The old process is dead — remove stale PID file and retry lock. + // We drop and re-create the lock file to clear the stale flock. 
+ drop(lock_file); + let lock_file = std::fs::File::create(&lock_path).with_context(|| { + format!( + "failed to re-create lock file after stale recovery {}", + lock_path.display() + ) + })?; + let fd = lock_file.as_raw_fd(); + let rc = + unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + anyhow::bail!( + "another broker instance is already running in this directory ({})", + root.display() + ); + } + // Successfully recovered — PID is written via connection.json at API start + return Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }); + } else { + anyhow::bail!( + "another broker instance is already running in this directory (pid: {}, {})", + old_pid, + root.display() + ); + } + } + // PID file missing or unreadable while lock is held — treat as stale. + // This happens when the user deletes .agent-relay/ while an old broker + // is still alive, or during the shutdown race (PID deleted before flock + // released). 
+ tracing::warn!( + "broker lock held but no valid PID file found, treating as stale and recovering" + ); + drop(lock_file); + let lock_file = std::fs::File::create(&lock_path).with_context(|| { + format!( + "failed to re-create lock file after stale recovery {}", + lock_path.display() + ) + })?; + let fd = lock_file.as_raw_fd(); + let rc = unsafe { nix::libc::flock(fd, nix::libc::LOCK_EX | nix::libc::LOCK_NB) }; + if rc != 0 { + anyhow::bail!( + "another broker instance is already running in this directory ({})", + root.display() + ); + } + return Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }); + } + } + + // PID is written via connection.json at API start + + Ok(RuntimePaths { + persist: true, + state: root.join(format!("state-{safe_name}.json")), + pending: root.join(format!("pending-{safe_name}.json")), + _lock: Some(lock_file), + }) +} + +pub(crate) fn derive_ws_base_url_from_http(http_base: &str) -> String { + let trimmed = http_base.trim(); + if let Some(rest) = trimmed.strip_prefix("https://") { + format!("wss://{rest}") + } else if let Some(rest) = trimmed.strip_prefix("http://") { + format!("ws://{rest}") + } else { + trimmed.to_string() + } +} diff --git a/crates/broker/src/runtime/session.rs b/crates/broker/src/runtime/session.rs new file mode 100644 index 000000000..5625629d7 --- /dev/null +++ b/crates/broker/src/runtime/session.rs @@ -0,0 +1,264 @@ +use super::*; + +/// Shared Relaycast connection state used by run_init and run_wrap. 
+#[derive(Clone)] +pub(crate) struct RelayWorkspace { + pub(crate) workspace_id: String, + pub(crate) workspace_alias: Option, + pub(crate) relay_workspace_key: String, + pub(crate) self_name: String, + pub(crate) self_agent_id: String, + pub(crate) self_names: HashSet, + pub(crate) self_agent_ids: HashSet, + pub(crate) http_client: RelaycastHttpClient, + pub(crate) ws_control_tx: mpsc::Sender, +} + +pub(crate) struct RelaySession { + pub(crate) http_base: String, + pub(crate) default_workspace_id: Option, + pub(crate) workspaces: Vec, + pub(crate) ws_inbound_rx: mpsc::Receiver, +} + +#[derive(Clone)] +pub(crate) struct RelayReadyState { + pub(super) workspace_key: String, + pub(super) memberships: Vec, + pub(super) default_workspace_id: Option, +} + +pub(crate) async fn serve_startup_api_until_ready( + listener: tokio::net::TcpListener, + relay_ready: Arc, +) -> tokio::net::TcpListener { + loop { + tokio::select! { + _ = relay_ready.notified() => { + return listener; + } + accepted = listener.accept() => { + match accepted { + Ok((stream, _addr)) => { + tokio::spawn(handle_startup_api_connection(stream)); + } + Err(error) => { + tracing::warn!(error = %error, "startup API accept failed"); + tokio::time::sleep(Duration::from_millis(50)).await; + } + } + } + } + } +} + +pub(crate) async fn handle_startup_api_connection(mut stream: tokio::net::TcpStream) { + let mut buffer = [0_u8; 1024]; + let read = match timeout(Duration::from_secs(5), stream.read(&mut buffer)).await { + Ok(Ok(read)) => read, + Ok(Err(error)) => { + tracing::debug!(error = %error, "failed reading startup API request"); + return; + } + Err(_) => return, + }; + + let request = String::from_utf8_lossy(&buffer[..read]); + let path = request + .lines() + .next() + .and_then(|line| line.split_whitespace().nth(1)) + .unwrap_or("/"); + let (status, content_type, body) = if path == "/health" { + ( + "200 OK", + "application/json", + listen_api::listen_api_health_payload(None, vec![]).to_string(), + ) + } 
else { + ( + "503 Service Unavailable", + "text/plain; charset=utf-8", + "Broker is starting, please retry".to_string(), + ) + }; + let response = format!( + "HTTP/1.1 {status}\r\ncontent-type: {content_type}\r\ncontent-length: {}\r\nconnection: close\r\n\r\n{body}", + body.len() + ); + if let Err(error) = stream.write_all(response.as_bytes()).await { + tracing::debug!(error = %error, "failed writing startup API response"); + } +} + +/// Normalize an optional initial task: empty or whitespace-only text becomes `None`. +pub(crate) fn normalize_initial_task(task: Option) -> Option { + task.and_then(|value| { + if value.trim().is_empty() { + None + } else { + Some(value) + } + }) +} + +pub(crate) struct RelaySessionOptions<'a> { + pub(crate) paths: &'a RuntimePaths, + pub(crate) requested_name: &'a str, + pub(crate) channels: Vec, + pub(crate) strict_name: bool, + pub(crate) agent_type: Option<&'a str>, + /// Read .mcp.json for additional self-name identities + pub(crate) read_mcp_identity: bool, + /// Write relaycast server entry to .mcp.json + pub(crate) ensure_mcp_config: bool, + pub(crate) runtime_cwd: &'a Path, +} + +pub(crate) async fn connect_relay(opts: RelaySessionOptions<'_>) -> Result { + let startup_debug = startup_debug_enabled(); + let connect_started = Instant::now(); + let http_base = std::env::var("RELAYCAST_BASE_URL") + .ok() + .or_else(|| std::env::var("RELAY_BASE_URL").ok()) + .unwrap_or_else(|| DEFAULT_RELAYCAST_BASE_URL.to_string()); + let ws_base = std::env::var("RELAYCAST_WS_URL") + .unwrap_or_else(|_| derive_ws_base_url_from_http(&http_base)); + + log_startup_phase( + startup_debug, + connect_started, + format!( + "connect_relay begin requested_name='{}' channels={}", + opts.requested_name, + opts.channels.join(",") + ), + ); + let auth = AuthClient::new(http_base.clone()); + let sessions = auth + .startup_session_set_with_options( + Some(opts.requested_name), + opts.strict_name, + opts.agent_type, + ) + .await + .context("failed to initialize relaycast
session")?; + log_startup_phase( + startup_debug, + connect_started, + format!( + "startup_session_set_with_options complete memberships={}", + sessions.memberships.len() + ), + ); + + let default_session = sessions + .default_session() + .or_else(|| sessions.memberships.first()) + .context("no relaycast memberships were initialized")?; + let relay_workspace_key = default_session.credentials.api_key.clone(); + let self_agent_id = default_session.credentials.agent_id.clone(); + let self_token = default_session.token.clone(); + let agent_name = default_session + .credentials + .agent_name + .clone() + .unwrap_or_else(|| opts.requested_name.to_string()); + + let identity_debug = format!( + "agent_name='{}' +requested='{}' +agent_id='{}' +token_prefix='{}' +default_workspace='{}' +workspace_count='{}' +timestamp='{}' +", + agent_name, + opts.requested_name, + self_agent_id, + &self_token[..self_token.len().min(16)], + default_session.credentials.workspace_id, + sessions.memberships.len(), + chrono::Utc::now().to_rfc3339() + ); + let debug_path = opts + .paths + .state + .parent() + .unwrap() + .join("identity-debug.txt"); + if std::env::var("AGENT_RELAY_NO_DEBUG_FILES").is_err() { + let _ = std::fs::write(&debug_path, &identity_debug); + eprintln!( + "[agent-relay] identity debug written to {}", + debug_path.display() + ); + } + if agent_name != opts.requested_name { + eprintln!( + "[agent-relay] WARNING: registered as '{}' (requested '{}')", + agent_name, opts.requested_name + ); + } + + if opts.ensure_mcp_config { + if let Err(error) = ensure_relaycast_mcp_config( + opts.runtime_cwd, + Some(relay_workspace_key.as_str()), + Some(http_base.as_str()), + None, + ) { + tracing::warn!("failed to ensure .mcp.json: {error}"); + } + } + + log_startup_phase( + startup_debug, + connect_started, + "MultiWorkspaceSession::new begin", + ); + let mut multi = MultiWorkspaceSession::new( + http_base.clone(), + ws_base, + auth, + sessions, + opts.channels, + opts.read_mcp_identity, + 
opts.runtime_cwd, + relay_broker::events::EventEmitter::new(false), + ); + log_startup_phase( + startup_debug, + connect_started, + format!( + "MultiWorkspaceSession::new complete handles={} default_workspace={:?}", + multi.handles.len(), + multi.default_workspace_id + ), + ); + + let default_workspace_id = multi.default_workspace_id.clone(); + let workspaces = multi + .handles + .drain(..) + .map(|handle| RelayWorkspace { + workspace_id: handle.workspace_id, + workspace_alias: handle.workspace_alias, + relay_workspace_key: handle.relay_workspace_key, + self_name: handle.self_name, + self_agent_id: handle.self_agent_id, + self_names: handle.self_names, + self_agent_ids: handle.self_agent_ids, + http_client: handle.http_client, + ws_control_tx: handle.ws_control_tx, + }) + .collect(); + + Ok(RelaySession { + http_base, + default_workspace_id, + workspaces, + ws_inbound_rx: multi.inbound_rx, + }) +} diff --git a/crates/broker/src/runtime/spawn_spec.rs b/crates/broker/src/runtime/spawn_spec.rs new file mode 100644 index 000000000..9781d84bb --- /dev/null +++ b/crates/broker/src/runtime/spawn_spec.rs @@ -0,0 +1,68 @@ +use super::*; + +pub(crate) fn runtime_label(runtime: &AgentRuntime) -> &'static str { + match runtime { + AgentRuntime::Pty => "pty", + AgentRuntime::Headless => "headless", + } +} + +#[allow(clippy::too_many_arguments)] +pub(crate) fn build_http_api_spawn_spec( + name: String, + cli: String, + transport: Option, + model: Option, + args: Vec, + channels: Vec, + cwd: Option, + team: Option, + shadow_of: Option, + shadow_mode: Option, + restart_policy: Option, +) -> Result { + let runtime = match transport + .as_deref() + .map(str::trim) + .filter(|value| !value.is_empty()) + .map(|value| value.to_ascii_lowercase()) + { + None => AgentRuntime::Pty, + Some(value) if value == "pty" => AgentRuntime::Pty, + Some(value) if value == "headless" => AgentRuntime::Headless, + Some(other) => { + anyhow::bail!("unsupported transport '{other}' (expected 'pty' or 
'headless')") + } + }; + let parsed_restart_policy = match restart_policy { + Some(v) => Some(serde_json::from_value(v).context("invalid restart_policy")?), + None => None, + }; + + let (provider, cli_command, model) = match runtime { + AgentRuntime::Pty => (None, Some(cli), model), + AgentRuntime::Headless => { + let provider = headless_provider_from_cli(&cli).with_context(|| { + format!( + "provider '{cli}' does not support headless transport (supported: claude, opencode)" + ) + })?; + (Some(provider), None, model) + } + }; + + Ok(AgentSpec { + name, + runtime, + provider, + cli: cli_command, + model, + cwd, + team, + shadow_of, + shadow_mode, + args, + channels, + restart_policy: parsed_restart_policy, + }) +} diff --git a/crates/broker/src/runtime/system.rs b/crates/broker/src/runtime/system.rs new file mode 100644 index 000000000..7743d825d --- /dev/null +++ b/crates/broker/src/runtime/system.rs @@ -0,0 +1,94 @@ +use super::*; + +/// Get terminal rows from TIOCGWINSZ. +#[cfg(unix)] +pub(crate) fn terminal_rows() -> Option { + use nix::libc; + use nix::pty::Winsize; + let mut ws = Winsize { + ws_row: 0, + ws_col: 0, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe { + if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_row > 0 { + Some(ws.ws_row) + } else { + None + } + } +} + +/// Get terminal cols from TIOCGWINSZ. 
+#[cfg(unix)] +pub(crate) fn terminal_cols() -> Option { + use nix::libc; + use nix::pty::Winsize; + let mut ws = Winsize { + ws_row: 0, + ws_col: 0, + ws_xpixel: 0, + ws_ypixel: 0, + }; + unsafe { + if libc::ioctl(libc::STDOUT_FILENO, libc::TIOCGWINSZ, &mut ws) == 0 && ws.ws_col > 0 { + Some(ws.ws_col) + } else { + None + } + } +} + +#[cfg(not(unix))] +pub(crate) fn terminal_rows() -> Option { + None +} +#[cfg(not(unix))] +pub(crate) fn terminal_cols() -> Option { + None +} + +#[cfg(target_os = "linux")] +pub(crate) fn memory_bytes_for_pid(pid: u32) -> u64 { + let statm_path = format!("/proc/{pid}/statm"); + let statm = match std::fs::read_to_string(statm_path) { + Ok(contents) => contents, + Err(_) => return 0, + }; + + let rss_pages = match statm + .split_whitespace() + .nth(1) + .and_then(|value| value.parse::().ok()) + { + Some(value) => value, + None => return 0, + }; + + let page_size = unsafe { nix::libc::sysconf(nix::libc::_SC_PAGESIZE) }; + if page_size <= 0 { + return 0; + } + + rss_pages.saturating_mul(page_size as u64) +} + +#[cfg(not(target_os = "linux"))] +pub(crate) fn memory_bytes_for_pid(_pid: u32) -> u64 { + 0 +} + +pub(crate) fn build_agent_metrics(handle: &WorkerHandle) -> AgentMetrics { + let pid = handle.child.id().unwrap_or_default(); + AgentMetrics { + name: handle.spec.name.clone(), + pid, + memory_bytes: if pid == 0 { + 0 + } else { + memory_bytes_for_pid(pid) + }, + uptime_secs: handle.spawned_at.elapsed().as_secs(), + } +} diff --git a/crates/broker/src/runtime/tests.rs b/crates/broker/src/runtime/tests.rs new file mode 100644 index 000000000..606e75c0b --- /dev/null +++ b/crates/broker/src/runtime/tests.rs @@ -0,0 +1,1743 @@ +use std::{ + collections::{BTreeSet, HashMap, HashSet}, + path::PathBuf, + process::Stdio, + time::{Duration, Instant}, +}; + +use crate::helpers::{ + detect_bypass_permissions_prompt, detect_claude_trust_prompt, floor_char_boundary, + format_injection, is_auto_suggestion, is_bypass_selection_menu, 
is_in_editor_mode, strip_ansi, +}; +use crate::worker::{WorkerEvent, WorkerHandle, WorkerRegistry}; +use relay_broker::protocol::{AgentSpec, MessageInjectionMode, RelayDelivery}; +use serde_json::{json, Value}; +use tokio::sync::mpsc; + +use super::{ + build_agent_state_transition_event, build_http_api_spawn_spec, build_thread_infos, + channels_from_csv, continuity_dir, delivery_retry_interval, derive_ws_base_url_from_http, + display_target_for_dashboard, drop_pending_for_worker, extract_mcp_message_ids, + http_api_event_emit_timeout, http_api_local_delivery_timeout, http_api_relaycast_send_timeout, + is_relaycast_self_control_target, is_unknown_worker_error_message, normalize_channel, + normalize_initial_task, normalize_sender, queue_inbound_for_delivery_mode, + relaycast_spawn_control_dedup_key, relaycast_ws_control_dedup_key, + relaycast_ws_should_apply_local_spawn_echo_dedup, relaycast_ws_spawn_token, + sender_is_dashboard_label, should_clear_pending_delivery_for_event, AgentRuntime, + InboundContext, InboundQueueOutcome, PendingDelivery, ProtocolHeadlessProvider, +}; +use relay_broker::dedup::DedupCache; +use relay_broker::relaycast_ws::{format_worker_preregistration_error, RelaycastRegistrationError}; +use relay_broker::types::{InboundDeliveryMode, InboundDeliveryState}; + +async fn make_worker_registry_with_worker(name: &str) -> WorkerRegistry { + let (tx, _rx) = mpsc::channel::(16); + let mut registry = WorkerRegistry::new( + tx, + Vec::new(), + PathBuf::from("/tmp/agent-relay-broker-tests"), + Instant::now(), + ); + let mut child = tokio::process::Command::new("cat") + .stdin(Stdio::piped()) + .stdout(Stdio::null()) + .stderr(Stdio::null()) + .spawn() + .expect("test worker process should spawn"); + let stdin = child.stdin.take().expect("test worker stdin should exist"); + registry.workers.insert( + name.to_string(), + WorkerHandle { + spec: AgentSpec { + name: name.to_string(), + runtime: AgentRuntime::Pty, + provider: None, + cli: 
Some("cat".to_string()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: Vec::new(), + channels: Vec::new(), + restart_policy: None, + }, + parent: None, + workspace_id: Some("ws_demo".to_string()), + child, + stdin, + spawned_at: Instant::now(), + }, + ); + registry +} + +async fn cleanup_worker_registry(mut registry: WorkerRegistry) { + for handle in registry.workers.values_mut() { + let _ = handle.child.start_kill(); + let _ = handle.child.wait().await; + } +} + +fn inbound_ctx<'a>(event_id: &'a str) -> InboundContext<'a> { + InboundContext { + from: "Alice", + body: "hello from relay", + target: "#general", + thread_id: Some("thr_123"), + workspace_id: Some("ws_demo"), + workspace_alias: Some("Demo"), + priority: 1, + mode: MessageInjectionMode::Steer, + event_id: Some(event_id), + } +} + +#[tokio::test] +async fn inbound_queue_auto_inject_drains_immediately_with_full_context() { + let worker_name = "worker-a"; + let workers = make_worker_registry_with_worker(worker_name).await; + let mut delivery_states = HashMap::new(); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + worker_name, + inbound_ctx("evt_auto"), + ); + + match outcome { + InboundQueueOutcome::DrainNow(messages) => { + assert_eq!(messages.len(), 1); + let msg = &messages[0]; + assert_eq!(msg.from, "Alice"); + assert_eq!(msg.body, "hello from relay"); + assert_eq!(msg.target, "#general"); + assert_eq!(msg.thread_id.as_deref(), Some("thr_123")); + assert_eq!(msg.workspace_id.as_deref(), Some("ws_demo")); + assert_eq!(msg.workspace_alias.as_deref(), Some("Demo")); + assert_eq!(msg.priority, 1); + assert_eq!(msg.mode, MessageInjectionMode::Steer); + assert_eq!(msg.event_id.as_deref(), Some("evt_auto")); + } + other => panic!("expected immediate drain, got {other:?}"), + } + assert_eq!( + delivery_states + .get(worker_name) + .expect("state should be created") + .pending_snapshot(), + Vec::new(), + "auto_inject drains the 
per-worker pending queue in the same broker turn" + ); + + cleanup_worker_registry(workers).await; +} + +#[tokio::test] +async fn inbound_queue_manual_flush_holds_until_explicit_drain() { + let worker_name = "worker-a"; + let workers = make_worker_registry_with_worker(worker_name).await; + let mut delivery_states = HashMap::from([( + worker_name.to_string(), + InboundDeliveryState::new(InboundDeliveryMode::ManualFlush), + )]); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + worker_name, + inbound_ctx("evt_manual"), + ); + + assert_eq!(outcome, InboundQueueOutcome::Queued); + let snapshot = delivery_states + .get(worker_name) + .expect("manual state should remain present") + .pending_snapshot(); + assert_eq!(snapshot.len(), 1); + assert_eq!(snapshot[0].event_id.as_deref(), Some("evt_manual")); + assert_eq!(snapshot[0].target, "#general"); + + cleanup_worker_registry(workers).await; +} + +#[tokio::test] +async fn inbound_queue_worker_missing_does_not_create_state() { + let (tx, _rx) = mpsc::channel::(16); + let workers = WorkerRegistry::new( + tx, + Vec::new(), + PathBuf::from("/tmp/agent-relay-broker-tests"), + Instant::now(), + ); + let mut delivery_states = HashMap::new(); + + let outcome = queue_inbound_for_delivery_mode( + &mut delivery_states, + &workers, + "ghost", + inbound_ctx("evt_missing"), + ); + + assert_eq!(outcome, InboundQueueOutcome::WorkerMissing); + assert!(delivery_states.is_empty()); +} + +fn extract_kind_literals(source: &str) -> BTreeSet { + let marker = "\"kind\""; + let mut kinds = BTreeSet::new(); + let mut cursor = 0; + while let Some(offset) = source[cursor..].find(marker) { + let mut start = cursor + offset + marker.len(); + if start >= source.len() { + break; + } + if !source[start..].starts_with(':') { + cursor = start; + continue; + } + start += 1; + while start < source.len() && source.as_bytes()[start].is_ascii_whitespace() { + start += 1; + } + if start >= source.len() || 
source.as_bytes()[start] != b'"' { + cursor = start; + continue; + } + start += 1; + if let Some(end) = source[start..].find('"') { + let candidate = &source[start..start + end]; + if !candidate.is_empty() + && candidate + .chars() + .all(|c| c.is_ascii_lowercase() || c == '_' || c.is_ascii_digit()) + { + kinds.insert(candidate.to_string()); + } + } + cursor = start; + if cursor >= source.len() { + break; + } + } + kinds +} + +#[test] +fn parses_channels() { + assert_eq!(channels_from_csv("general,ops"), vec!["general", "ops"]); +} + +#[test] +fn channel_normalization() { + assert_eq!(normalize_channel("general"), "#general"); + assert_eq!(normalize_channel("#ops"), "#ops"); +} + +#[test] +fn normalize_initial_task_drops_empty_values() { + assert_eq!(normalize_initial_task(None), None); + assert_eq!(normalize_initial_task(Some(String::new())), None); + assert_eq!(normalize_initial_task(Some(" ".to_string())), None); +} + +#[test] +fn normalize_initial_task_keeps_non_empty_values() { + assert_eq!( + normalize_initial_task(Some("Ship the patch".to_string())), + Some("Ship the patch".to_string()) + ); +} + +#[test] +fn ws_base_derivation() { + assert_eq!( + derive_ws_base_url_from_http("https://api.relaycast.dev"), + "wss://api.relaycast.dev" + ); + assert_eq!( + derive_ws_base_url_from_http("http://localhost:8787"), + "ws://localhost:8787" + ); +} + +#[test] +fn relaycast_control_dedup_key_prefers_event_id() { + let value = json!({ + "type": "agent.spawn_requested", + "event_id": "evt_123", + "agent": { "name": "worker-a", "cli": "claude", "task": "Ship it" } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:evt_123".to_string()) + ); +} + +#[test] +fn relaycast_control_dedup_key_prefers_spawn_token_for_spawn_requests() { + let value = json!({ + "type": "agent.spawn_requested", + "event_id": "evt_123", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it", + 
"token": "at_live_worker" + } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:at_live_worker".to_string()) + ); +} + +#[test] +fn relaycast_control_dedup_key_falls_back_to_agent_name_for_spawn_requests() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + assert_eq!( + relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value), + Some("control:ws_1:agent.spawn_requested:worker-a".to_string()) + ); +} + +#[test] +fn relaycast_control_dedup_key_falls_back_to_serialized_payload() { + let value = json!({ + "type": "agent.release_requested", + "agent": { "name": "worker-a" } + }); + + let key = relaycast_ws_control_dedup_key("ws_1", "agent.release_requested", &value) + .expect("fallback dedup key"); + assert!(key.starts_with("control:ws_1:agent.release_requested:{")); + assert!(key.contains("\"worker-a\"")); +} + +#[test] +fn relaycast_ws_spawn_token_extracts_agent_token() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "token": "at_live_worker" + } + }); + + assert_eq!( + relaycast_ws_spawn_token(&value), + Some("at_live_worker".to_string()) + ); +} + +#[test] +fn relaycast_ws_spawn_name_only_control_key_skips_second_name_dedup() { + let value = json!({ + "type": "agent.spawn_requested", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + let control_key = relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value) + .expect("control dedup key"); + let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); + + assert_eq!(control_key, local_key); + assert!(!relaycast_ws_should_apply_local_spawn_echo_dedup( + Some(control_key.as_str()), + &local_key + )); +} + +#[test] +fn relaycast_ws_spawn_event_id_echo_still_uses_local_name_dedup() { + let value = json!({ + "type": 
"agent.spawn_requested", + "event_id": "evt_123", + "agent": { + "name": "worker-a", + "cli": "claude", + "task": "Ship it" + } + }); + + let control_key = relaycast_ws_control_dedup_key("ws_1", "agent.spawn_requested", &value) + .expect("control dedup key"); + let local_key = relaycast_spawn_control_dedup_key("ws_1", "worker-a"); + + assert_ne!(control_key, local_key); + assert!(relaycast_ws_should_apply_local_spawn_echo_dedup( + Some(control_key.as_str()), + &local_key + )); + + let now = Instant::now(); + let mut dedup = DedupCache::new(Duration::from_secs(60), 16); + assert!(dedup.insert_if_new(&local_key, now)); + assert!(dedup.insert_if_new(&control_key, now + Duration::from_secs(1))); + assert!(!dedup.insert_if_new(&local_key, now + Duration::from_secs(2))); +} + +#[test] +fn unknown_worker_error_message_matches_release_failures() { + assert!(is_unknown_worker_error_message("unknown worker 'worker-a'")); + assert!(is_unknown_worker_error_message( + "failed to release 'worker-a': unknown worker 'worker-a'" + )); + assert!(!is_unknown_worker_error_message("failed to bind api port")); +} + +#[test] +fn relaycast_self_control_target_matches_aliases_case_insensitively() { + let self_names = HashSet::from([ + "relay-broker".to_string(), + "relay-broker@workspace".to_string(), + ]); + + assert!(is_relaycast_self_control_target( + "Relay-Broker", + "relay-broker", + &self_names + )); + assert!(is_relaycast_self_control_target( + "@relay-broker@workspace", + "relay-broker", + &self_names + )); + assert!(!is_relaycast_self_control_target( + "worker-a", + "relay-broker", + &self_names + )); +} + +#[tokio::test] +async fn contract_health_fixture_requires_rich_listen_health_shape() { + let fixture: Value = serde_json::from_str(include_str!( + "../../../../packages/contracts/fixtures/health-fixtures.json" + )) + .expect("health fixture should be valid JSON"); + let expected_shape = fixture + .get("health_response") + .and_then(Value::as_object) + .expect("health fixture 
must include health_response object"); + + let actual = crate::listen_api::listen_api_health_payload(None, vec![]); + + for required_key in expected_shape.keys() { + // TODO(contract-wave1-health-shape): listen-mode /health should + // implement the shared BrokerHealthResponse contract fields. + assert!( + actual.get(required_key).is_some(), + "listen /health response is missing required contract field: {}", + required_key + ); + } +} + +#[tokio::test] +async fn contract_startup_429_fixture_requires_degraded_health_status() { + let fixture: Value = serde_json::from_str(include_str!( + "../../../../packages/contracts/fixtures/health-fixtures.json" + )) + .expect("health fixture should be valid JSON"); + let expected = fixture + .get("wave0_startup_429_degraded") + .and_then(|v| v.get("expected_health_status")) + .and_then(Value::as_str) + .expect("health fixture must include expected degraded health status"); + let startup_error_code = fixture + .get("wave0_startup_429_degraded") + .and_then(|v| v.get("error")) + .and_then(|v| v.get("code")) + .and_then(Value::as_str) + .expect("health fixture must include startup error code"); + std::env::set_var("AGENT_RELAY_STARTUP_ERROR_CODE", startup_error_code); + let actual = crate::listen_api::listen_api_health_payload(None, vec![]) + .get("status") + .and_then(Value::as_str) + .unwrap_or("unknown") + .to_string(); + std::env::remove_var("AGENT_RELAY_STARTUP_ERROR_CODE"); + + assert_eq!( + actual, expected, + "listen /health status \"{}\" does not match startup 429 degraded contract \"{}\"", + actual, expected + ); +} + +#[test] +fn contract_replay_fixture_requires_replay_route_exposure() { + let replay_fixture: Value = serde_json::from_str(include_str!( + "../../../../packages/contracts/fixtures/replay-fixtures.json" + )) + .expect("replay fixture should be valid JSON"); + assert!( + replay_fixture.get("replay_cursor_request").is_some(), + "replay fixture must include replay_cursor_request" + ); + assert!( + 
replay_fixture.get("replay_response").is_some(),
        "replay fixture must include replay_response"
    );

    let source = include_str!("../listen_api.rs");
    assert!(
        source.contains(".route(\"/api/events/replay\""),
        "listen API router does not expose /api/events/replay"
    );
}

/// Contract check: the `delivery_ack` handling in `init.rs` must mention the
/// fixture's terminal status (or the word "terminal") near the branch start.
#[test]
fn contract_timeout_fixture_requires_terminal_failed_guard_before_late_ack() {
    let replay_fixture: Value = serde_json::from_str(include_str!(
        "../../../../packages/contracts/fixtures/replay-fixtures.json"
    ))
    .expect("replay fixture should be valid JSON");
    let timeout_fixture = replay_fixture
        .get("wave0_timeout_terminal_semantics")
        .and_then(Value::as_object)
        .expect("replay fixture must include wave0_timeout_terminal_semantics object");

    let expected_terminal_status = timeout_fixture
        .get("expected_terminal_status")
        .and_then(Value::as_str)
        .expect("timeout fixture requires expected_terminal_status");
    let late_event_kind = timeout_fixture
        .get("late_event_kind")
        .and_then(Value::as_str)
        .expect("timeout fixture requires late_event_kind");

    let source = include_str!("init.rs");
    let ack_branch = source
        .find("msg_type == \"delivery_ack\"")
        .map(|idx| {
            // Inspect at most 1200 bytes after the match. The end is clamped
            // to a char boundary because slicing a `&str` inside a multi-byte
            // character panics.
            let end = floor_char_boundary(source, (idx + 1200).min(source.len()));
            &source[idx..end]
        })
        .expect("init.rs must include delivery_ack handling");

    assert!(
        ack_branch.contains(expected_terminal_status) || ack_branch.contains("terminal"),
        "delivery_ack branch lacks terminal guard for timeout status \"{}\" and late event \"{}\"",
        expected_terminal_status,
        late_event_kind
    );
}

#[test]
fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() {
    let event_fixture: Value = serde_json::from_str(include_str!(
        "../../../../packages/contracts/fixtures/event-fixtures.json"
    ))
    .expect("event fixture should be valid JSON");
    let required = event_fixture
        .get("wave0_broadcast_whitelist")
        .and_then(|v| v.get("required_kinds"))
        .and_then(Value::as_array)
        .expect("event fixture must include wave0_broadcast_whitelist.required_kinds")
        .iter()
        .filter_map(Value::as_str)
        .map(str::to_owned)
        .collect::<BTreeSet<_>>();

    let emitted = extract_kind_literals(include_str!("init.rs"));

    assert!(
        required.is_subset(&emitted),
        "broker source is missing required broadcast kinds; expected {:?}, got {:?}",
        required,
        emitted
    );
}

#[test]
fn build_thread_infos_groups_channel_messages() {
    let messages = vec![
        json!({
            "from": "broker",
            "target": "#general",
            "text": "outbound",
            "timestamp": "2026-02-23T10:00:00Z",
        }),
        json!({
            "from": "Lead",
            "target": "#general",
            "text": "inbound",
            "timestamp": "2026-02-23T10:01:00Z",
        }),
    ];
    let self_names = HashSet::from(["broker".to_string()]);
    let threads = build_thread_infos(&messages, &self_names);

    assert_eq!(threads.len(), 1);
    assert_eq!(threads[0].thread_id, "#general");
    assert_eq!(threads[0].name, "#general");
    assert_eq!(threads[0].unread_count, 1);
    assert_eq!(threads[0].last_message.as_deref(), Some("inbound"));
}

#[test]
fn build_thread_infos_groups_direct_messages_case_insensitively() {
    let messages = vec![
        json!({
            "from": "BROKER",
            "to": "WorkerA",
            "text": "ping",
            "timestamp": "2026-02-23T10:00:00Z",
        }),
        json!({
            "from": "workera",
            "to": "broker",
            "text": "pong",
            "timestamp": "2026-02-23T10:01:00Z",
        }),
    ];
    let self_names = HashSet::from(["broker".to_string()]);
    let threads = build_thread_infos(&messages, &self_names);

    assert_eq!(threads.len(), 1);
    assert_eq!(threads[0].thread_id, "direct:broker:workera");
    assert_eq!(threads[0].name, "workera");
    assert_eq!(threads[0].unread_count, 1);
    assert_eq!(threads[0].last_message.as_deref(), Some("pong"));
}

#[test]
fn build_thread_infos_uses_dm_conversation_id_and_sender_name() {
    let messages = vec![json!({
        "from": "Planner",
        "conversation_id": "conv_123",
        "text": "dm payload",
        "timestamp": "2026-02-23T10:01:00Z",
    })];
    let self_names =
HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!(threads[0].thread_id, "conv_123"); + assert_eq!(threads[0].name, "Planner"); + assert_eq!(threads[0].unread_count, 1); +} + +#[test] +fn build_thread_infos_shows_dms_between_non_broker_agents() { + let messages = vec![ + json!({ + "from": "WorkerA", + "conversation_id": "dm_456", + "participants": ["WorkerA", "WorkerB"], + "text": "hello WorkerB", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "WorkerB", + "conversation_id": "dm_456", + "participants": ["WorkerA", "WorkerB"], + "text": "hi WorkerA", + "timestamp": "2026-02-23T10:01:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1, "should group into one conversation"); + assert_eq!(threads[0].thread_id, "dm_456"); + assert_eq!(threads[0].name, "WorkerA ↔ WorkerB"); + assert_eq!( + threads[0].unread_count, 2, + "both messages unread (neither from broker)" + ); + assert_eq!(threads[0].last_message.as_deref(), Some("hi WorkerA")); +} + +#[test] +fn build_thread_infos_dm_with_participants_filters_broker() { + let messages = vec![json!({ + "from": "WorkerA", + "conversation_id": "dm_789", + "participants": ["broker", "WorkerA"], + "text": "hello broker", + "timestamp": "2026-02-23T10:00:00Z", + })]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!( + threads[0].name, "WorkerA", + "should filter out broker from participants" + ); +} + +#[test] +fn build_thread_infos_multiple_independent_dm_conversations() { + let messages = vec![ + json!({ + "from": "Alice", + "conversation_id": "dm_aaa", + "participants": ["Alice", "Bob"], + "text": "hi Bob", + "timestamp": "2026-02-23T10:00:00Z", + }), + json!({ + "from": "Charlie", + 
"conversation_id": "dm_bbb", + "participants": ["Charlie", "Diana"], + "text": "hi Diana", + "timestamp": "2026-02-23T10:01:00Z", + }), + json!({ + "from": "broker", + "conversation_id": "dm_ccc", + "participants": ["broker", "Eve"], + "text": "hi Eve", + "timestamp": "2026-02-23T10:02:00Z", + }), + ]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!( + threads.len(), + 3, + "should have three separate DM conversations" + ); + + let thread_aaa = threads.iter().find(|t| t.thread_id == "dm_aaa").unwrap(); + assert_eq!(thread_aaa.name, "Alice ↔ Bob"); + + let thread_bbb = threads.iter().find(|t| t.thread_id == "dm_bbb").unwrap(); + assert_eq!(thread_bbb.name, "Charlie ↔ Diana"); + + let thread_ccc = threads.iter().find(|t| t.thread_id == "dm_ccc").unwrap(); + assert_eq!(thread_ccc.name, "Eve", "broker filtered from participants"); +} + +#[test] +fn build_thread_infos_respects_explicit_unread_count() { + let messages = vec![json!({ + "from": "Planner", + "target": "broker", + "text": "status", + "unread_count": 7, + "timestamp": "2026-02-23T10:01:00Z", + })]; + let self_names = HashSet::from(["broker".to_string()]); + let threads = build_thread_infos(&messages, &self_names); + + assert_eq!(threads.len(), 1); + assert_eq!(threads[0].unread_count, 7); +} + +#[test] +fn build_agent_state_transition_event_has_expected_shape() { + let payload = build_agent_state_transition_event("worker-a", "spawned", Some("sdk_spawn")); + assert_eq!(payload["type"], "agent.state"); + assert_eq!(payload["state"], "spawned"); + assert_eq!(payload["agent"]["name"], "worker-a"); + assert_eq!(payload["reason"], "sdk_spawn"); + assert!(payload["timestamp"].as_str().is_some()); + + let no_reason = build_agent_state_transition_event("worker-a", "idle", None); + assert!(no_reason.get("reason").is_none()); +} + +#[test] +fn preregistration_error_message_dedupes_retry_after_for_rate_limit() { + let error = 
RelaycastRegistrationError::RateLimited { + agent_name: "Foobar".to_string(), + retry_after_secs: 60, + detail: "{\"ok\":false}".to_string(), + }; + let message = format_worker_preregistration_error("Foobar", &error); + assert_eq!(message.matches("retry after").count(), 1); +} + +#[test] +fn preregistration_error_message_does_not_invent_retry_after_for_transport_errors() { + let error = RelaycastRegistrationError::Transport { + agent_name: "Foobar".to_string(), + detail: "timeout".to_string(), + }; + let message = format_worker_preregistration_error("Foobar", &error); + assert!(!message.contains("retry after")); +} + +#[test] +fn injection_format_preserved() { + let rendered = format_injection("alice", "evt_1", "hello", "bob"); + assert!(rendered.contains("")); + assert!(rendered.contains("mcp__relaycast__message_dm_send")); + assert!(rendered.contains("Relay message from alice [evt_1]: hello")); +} + +#[test] +fn injection_format_includes_channel() { + let rendered = format_injection("alice", "evt_1", "hello", "#general"); + assert!(rendered.contains("mcp__relaycast__message_post")); + assert!(rendered.contains("channel: \"general\"")); + assert!(rendered.contains("Relay message from alice in #general [evt_1]: hello")); +} + +#[test] +fn normalize_sender_defaults_to_human_orchestrator() { + assert_eq!(normalize_sender(None), "human:orchestrator"); + assert_eq!(normalize_sender(Some(String::new())), "human:orchestrator"); + assert_eq!( + normalize_sender(Some(" ".to_string())), + "human:orchestrator" + ); +} + +#[test] +fn normalize_sender_normalizes_human_prefix() { + assert_eq!( + normalize_sender(Some("human: Dashboard ".to_string())), + "human:Dashboard" + ); +} + +#[test] +fn normalize_sender_preserves_worker_names() { + assert_eq!( + normalize_sender(Some("WorkerOne".to_string())), + "WorkerOne".to_string() + ); +} + +#[test] +fn sender_is_dashboard_label_accepts_legacy_dashboard_senders() { + assert!(sender_is_dashboard_label("Dashboard", "my-project")); + 
assert!(sender_is_dashboard_label("human:Dashboard", "my-project")); + assert!(sender_is_dashboard_label( + "human:orchestrator", + "my-project" + )); + assert!(sender_is_dashboard_label("my-project", "my-project")); + assert!(!sender_is_dashboard_label("Lead", "my-project")); +} + +#[test] +fn display_target_for_dashboard_maps_self_identity() { + let mut self_names = HashSet::new(); + self_names.insert("broker-951762d5".to_string()); + self_names.insert("DashProbe".to_string()); + let primary = "my-project"; + + assert_eq!( + display_target_for_dashboard("broker-951762d5", &self_names, primary), + "my-project" + ); + assert_eq!( + display_target_for_dashboard("dashprobe", &self_names, primary), + "my-project" + ); + assert_eq!( + display_target_for_dashboard("Lead", &self_names, primary), + "Lead".to_string() + ); +} + +#[test] +fn delivery_retry_interval_uses_default_and_env_override() { + std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); + assert_eq!(delivery_retry_interval().as_millis(), 1_000); + + std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "250"); + assert_eq!(delivery_retry_interval().as_millis(), 250); + + std::env::set_var("AGENT_RELAY_DELIVERY_RETRY_MS", "1"); + assert_eq!(delivery_retry_interval().as_millis(), 50); + + std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); +} + +#[test] +fn http_api_timeout_windows_use_default_and_env_override() { + std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); + + assert_eq!(http_api_local_delivery_timeout().as_millis(), 3_000); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 20_000); + assert_eq!(http_api_event_emit_timeout().as_millis(), 200); + + std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "10"); + std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "100"); + 
std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "1"); + + assert_eq!(http_api_local_delivery_timeout().as_millis(), 100); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 500); + assert_eq!(http_api_event_emit_timeout().as_millis(), 25); + + std::env::set_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS", "1500"); + std::env::set_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS", "12000"); + std::env::set_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS", "150"); + + assert_eq!(http_api_local_delivery_timeout().as_millis(), 1_500); + assert_eq!(http_api_relaycast_send_timeout().as_millis(), 12_000); + assert_eq!(http_api_event_emit_timeout().as_millis(), 150); + + std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); + std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); +} + +#[test] +fn drop_pending_for_worker_removes_only_matching_entries() { + let mut pending = HashMap::new(); + pending.insert( + "del_1".to_string(), + PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }, + ); + pending.insert( + "del_2".to_string(), + PendingDelivery { + worker_name: "B".to_string(), + delivery: RelayDelivery { + delivery_id: "del_2".to_string(), + event_id: "evt_2".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "y".to_string(), + target: "#general".to_string(), + body: "world".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + 
}, + attempts: 1, + next_retry_at: Instant::now(), + }, + ); + + let dropped = drop_pending_for_worker(&mut pending, "A"); + assert_eq!(dropped, 1); + assert!(pending.contains_key("del_2")); + assert!(!pending.contains_key("del_1")); +} + +#[test] +fn should_clear_pending_delivery_when_event_id_matches() { + let pending = PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }; + + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + Some("evt_1") + )); + assert!(!should_clear_pending_delivery_for_event( + Some(&pending), + Some("evt_2") + )); +} + +#[test] +fn should_clear_pending_delivery_without_event_id_for_compatibility() { + let pending = PendingDelivery { + worker_name: "A".to_string(), + delivery: RelayDelivery { + delivery_id: "del_1".to_string(), + event_id: "evt_1".to_string(), + workspace_id: Some("ws_test".to_string()), + workspace_alias: Some("test".to_string()), + from: "x".to_string(), + target: "#general".to_string(), + body: "hello".to_string(), + thread_id: None, + priority: None, + injection_mode: MessageInjectionMode::Wait, + }, + attempts: 1, + next_retry_at: Instant::now(), + }; + + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + None + )); + assert!(should_clear_pending_delivery_for_event( + Some(&pending), + Some("") + )); + assert!(should_clear_pending_delivery_for_event(None, Some("evt_1"))); +} + +// ==================== strip_ansi tests ==================== + +#[test] +fn strip_ansi_removes_csi_sequences() { + assert_eq!(strip_ansi("\x1b[32mHello\x1b[0m"), "Hello"); + 
assert_eq!(strip_ansi("\x1b[1;31mred bold\x1b[0m"), "red bold"); +} + +#[test] +fn strip_ansi_removes_osc_sequences() { + assert_eq!(strip_ansi("\x1b]0;title\x07rest"), "rest"); + assert_eq!(strip_ansi("\x1b]0;title\x1b\\rest"), "rest"); +} + +#[test] +fn strip_ansi_preserves_plain_text() { + assert_eq!(strip_ansi("Hello world"), "Hello world"); + assert_eq!(strip_ansi(""), ""); +} + +#[test] +fn strip_ansi_handles_mixed_content() { + let input = "\x1b[33m⚠️ bypass\x1b[0m permissions mode\n\x1b[1m(yes/no)\x1b[0m"; + let clean = strip_ansi(input); + assert!(clean.contains("bypass")); + assert!(clean.contains("(yes/no)")); + assert!(!clean.contains("\x1b")); +} + +#[test] +fn strip_ansi_handles_cursor_forward_sequences() { + // Claude Code uses \x1b[1C (cursor forward) instead of spaces + // These should be replaced with spaces so echo detection works + let input = "\x1b[1CYes,\x1b[1CI\x1b[1Caccept"; + let clean = strip_ansi(input); + assert_eq!(clean, " Yes, I accept"); +} + +// ==================== floor_char_boundary tests ==================== + +#[test] +fn floor_char_boundary_at_valid_positions() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 0), 0); + assert_eq!(floor_char_boundary(s, 6), 6); + assert_eq!(floor_char_boundary(s, 9), 9); +} + +#[test] +fn floor_char_boundary_mid_multibyte() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 7), 6); + assert_eq!(floor_char_boundary(s, 8), 6); +} + +#[test] +fn floor_char_boundary_past_end() { + let s = "Hello 世界"; + assert_eq!(floor_char_boundary(s, 100), s.len()); +} + +// ==================== detect_bypass_permissions_prompt tests ==================== + +#[test] +fn bypass_perms_yes_no_prompt() { + let output = "⚠️ Bypassing all permission checks.\nDo you want to proceed? 
(yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); +} + +#[test] +fn bypass_perms_dangerously_with_yn() { + let output = "Running with --dangerously-skip-permissions\nAccept the risks? (y/n)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); +} + +#[test] +fn bypass_perms_accept_risk_variant() { + let output = "bypass permissions mode enabled\nDo you accept the risk of running in this mode?"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); +} + +#[test] +fn bypass_perms_no_match_normal_output() { + let output = "I'll help you fix that bug. Let me read the file first."; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(!has_ref); + assert!(!has_confirm); +} + +#[test] +fn bypass_perms_no_false_positive_permission_without_bypass() { + let output = "File permission denied. 
(yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(!has_ref, "permission without bypass should not match"); + assert!(has_confirm, "yes/no detected but insufficient alone"); +} + +#[test] +fn bypass_perms_no_false_positive_status_bar() { + let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref, "status bar has bypass+permissions"); + assert!(!has_confirm, "but no confirmation prompt"); +} + +#[test] +fn bypass_perms_selection_menu_format() { + let output = "WARNING: ClaudeCoderunninginBypassPermissionsmode\n\ + Byproceeding,youacceptallresponsibility\n\ + No,exit\nYes,Iaccept\nEntertoconfirm"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref); + assert!(has_confirm); + assert!(is_bypass_selection_menu(output)); +} + +#[test] +fn bypass_perms_selection_menu_with_spaces() { + let output = "WARNING: Claude Code running in Bypass Permissions mode\n\ + 1. No, exit\n2. Yes, I accept\nEnter to confirm"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref && has_confirm); + assert!(is_bypass_selection_menu(output)); +} + +#[test] +fn bypass_perms_legacy_not_selection_menu() { + let output = "bypass permissions mode\nProceed? (yes/no)"; + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(output); + assert!(has_ref && has_confirm, "legacy should still detect"); + assert!( + !is_bypass_selection_menu(output), + "legacy should NOT be selection menu" + ); +} + +#[test] +fn bypass_perms_with_raw_ansi() { + let raw = "\x1b[33m⚠️ bypass permissions\x1b[0m mode\nProceed? 
\x1b[1m(yes/no)\x1b[0m"; + let clean = strip_ansi(raw); + let (has_ref, has_confirm) = detect_bypass_permissions_prompt(&clean); + assert!(has_ref && has_confirm); +} + +// ==================== detect_claude_trust_prompt tests ==================== + +#[test] +fn claude_trust_prompt_full_match() { + let output = "take a moment to review what's in this folder first.\n\ + Claude Code'll be able to read, edit, and execute files here.\n\ + Security guide\n\ + ❯ 1. Yes, I trust this folder\n\ + 2. No, exit\n\ + Enter to confirm · Esc to cancel"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(has_confirmation); +} + +#[test] +fn claude_trust_prompt_stripped_spaces() { + let output = "Yes,Itrustthisfolder\nNo,exit"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(has_confirmation); +} + +#[test] +fn claude_trust_prompt_no_match_normal_output() { + let output = "I'll help you fix that bug. Let me read the file first."; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(!has_trust_ref); + assert!(!has_confirmation); +} + +#[test] +fn claude_trust_prompt_partial_no_exit() { + let output = "Yes, I trust this folder"; + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(output); + assert!(has_trust_ref); + assert!(!has_confirmation, "should not match without exit option"); +} + +#[test] +fn claude_trust_prompt_with_ansi() { + let raw = "\x1b[1m❯ 1. Yes, I trust this folder\x1b[0m\n 2. 
No, exit"; + let clean = strip_ansi(raw); + let (has_trust_ref, has_confirmation) = detect_claude_trust_prompt(&clean); + assert!(has_trust_ref && has_confirmation); +} + +// ==================== is_in_editor_mode tests ==================== + +#[test] +fn editor_mode_vim_insert() { + assert!(is_in_editor_mode("Some text\n-- INSERT --\n")); + assert!(is_in_editor_mode("Some text\n-- INSERT --")); +} + +#[test] +fn editor_mode_claude_cli_not_vim() { + let output = "-- INSERT -- ⏵⏵ bypass permissions on (shift+tab to cycle)"; + assert!(!is_in_editor_mode(output)); +} + +#[test] +fn editor_mode_nano() { + let output = " GNU nano 5.8\nFile: test.txt\n^G Get Help ^O Write Out"; + assert!(is_in_editor_mode(output)); +} + +#[test] +fn editor_mode_less_pager() { + assert!(is_in_editor_mode("some content\n(END)")); + assert!(is_in_editor_mode("some content\n--More--")); +} + +#[test] +fn editor_mode_normal_output() { + assert!(!is_in_editor_mode( + "I'll help you with that task. Let me search." + )); + assert!(!is_in_editor_mode("$ ls -la\ntotal 0\n$ ")); +} + +#[test] +fn editor_mode_with_ansi() { + let output = "\x1b[32mSome text\x1b[0m\n-- INSERT --\n"; + assert!(is_in_editor_mode(output)); +} + +#[test] +fn editor_mode_vim_visual_modes() { + assert!(is_in_editor_mode("text\n-- VISUAL --\n")); + assert!(is_in_editor_mode("text\n-- VISUAL LINE --\n")); + assert!(is_in_editor_mode("text\n-- VISUAL BLOCK --\n")); + assert!(is_in_editor_mode("text\n-- REPLACE --\n")); +} + +#[test] +fn editor_mode_claude_normal_not_vim() { + assert!(!is_in_editor_mode("-- NORMAL -- ► some Claude UI text")); + assert!(!is_in_editor_mode("-- VISUAL -- ▶ Claude UI")); +} + +#[test] +fn auto_suggestion_detects_cursor_plus_dim_pattern() { + assert!(is_auto_suggestion( + "\x1b[7mW\x1b[27m\x1b[2mhat's the task?\x1b[22m" + )); +} + +#[test] +fn auto_suggestion_detects_send_hint() { + assert!(is_auto_suggestion(" ↵ send")); +} + +#[test] +fn auto_suggestion_ignores_normal_output() { + 
assert!(!is_auto_suggestion("Relay message from Alice [abc]: hello")); + assert!(!is_auto_suggestion("Running tests...")); + assert!(!is_auto_suggestion("> \x1b[7m \x1b[27m")); +} + +#[test] +fn extract_mcp_ids_from_tool_response() { + let output = r#" ⎿ { + "id": "147310274064424960", + "conversation_id": "147310245874507776", + "from": "agent-a", + "text": "hello" + }"#; + let ids = extract_mcp_message_ids(output); + // Only extracts "id" keys, not "conversation_id" + assert_eq!(ids, vec!["147310274064424960"]); +} + +#[test] +fn extract_mcp_ids_ignores_short_ids() { + let output = r#""id": "123""#; + assert!(extract_mcp_message_ids(output).is_empty()); +} + +#[test] +fn extract_mcp_ids_ignores_non_numeric() { + let output = r#""id": "msg_abc123def456ghi""#; + assert!(extract_mcp_message_ids(output).is_empty()); +} + +#[test] +fn extract_mcp_ids_handles_no_ids() { + assert!(extract_mcp_message_ids("normal output with no JSON").is_empty()); + assert!(extract_mcp_message_ids("").is_empty()); +} + +// ==================== bypass flag selection logic tests ==================== +// Tests for the bypass flag logic used in WorkerRegistry::spawn(). 
+// The logic is: claude/claude:* → --dangerously-skip-permissions, codex → --dangerously-bypass-approvals-and-sandbox + +fn compute_bypass_flag(cli: &str, existing_args: &[String]) -> Option<&'static str> { + let cli_lower = cli.to_lowercase(); + if (cli_lower == "claude" || cli_lower.starts_with("claude:")) + && !existing_args + .iter() + .any(|a| a.contains("dangerously-skip-permissions")) + { + Some("--dangerously-skip-permissions") + } else if cli_lower == "codex" + && !existing_args + .iter() + .any(|a| a.contains("dangerously-bypass") || a.contains("full-auto")) + { + Some("--dangerously-bypass-approvals-and-sandbox") + } else if cli_lower == "gemini" && !existing_args.iter().any(|a| a == "--yolo" || a == "-y") { + Some("--yolo") + } else { + None + } +} + +#[test] +fn bypass_flag_claude_gets_skip_permissions() { + assert_eq!( + compute_bypass_flag("claude", &[]), + Some("--dangerously-skip-permissions") + ); +} + +#[test] +fn bypass_flag_claude_variant_gets_skip_permissions() { + assert_eq!( + compute_bypass_flag("claude:latest", &[]), + Some("--dangerously-skip-permissions") + ); + assert_eq!( + compute_bypass_flag("Claude", &[]), + Some("--dangerously-skip-permissions") + ); + assert_eq!( + compute_bypass_flag("CLAUDE:v2", &[]), + Some("--dangerously-skip-permissions") + ); +} + +#[test] +fn bypass_flag_codex_gets_dangerously_bypass() { + assert_eq!( + compute_bypass_flag("codex", &[]), + Some("--dangerously-bypass-approvals-and-sandbox") + ); +} + +#[test] +fn bypass_flag_gemini_gets_yolo() { + assert_eq!(compute_bypass_flag("gemini", &[]), Some("--yolo")); +} + +#[test] +fn bypass_flag_gemini_dedup_when_yolo_present() { + let args = vec!["--yolo".to_string()]; + assert_eq!( + compute_bypass_flag("gemini", &args), + None, + "should not duplicate --yolo flag" + ); +} + +#[test] +fn bypass_flag_gemini_dedup_when_y_present() { + let args = vec!["-y".to_string()]; + assert_eq!( + compute_bypass_flag("gemini", &args), + None, + "should not duplicate when -y 
shorthand present" + ); +} + +#[test] +fn bypass_flag_aider_gets_none() { + assert_eq!(compute_bypass_flag("aider", &[]), None); +} + +#[test] +fn bypass_flag_goose_gets_none() { + assert_eq!(compute_bypass_flag("goose", &[]), None); +} + +#[test] +fn bypass_flag_unknown_cli_gets_none() { + assert_eq!(compute_bypass_flag("mystery-cli", &[]), None); +} + +#[test] +fn bypass_flag_claude_dedup_when_already_present() { + let args = vec!["--dangerously-skip-permissions".to_string()]; + assert_eq!( + compute_bypass_flag("claude", &args), + None, + "should not duplicate flag" + ); +} + +#[test] +fn bypass_flag_codex_dedup_when_already_present() { + let args = vec!["--dangerously-bypass-approvals-and-sandbox".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + None, + "should not duplicate flag" + ); +} + +#[test] +fn bypass_flag_codex_dedup_when_full_auto_present() { + let args = vec!["--full-auto".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + None, + "should not add bypass when --full-auto already present" + ); +} + +#[test] +fn bypass_flag_claude_dedup_partial_match() { + // If someone passes a different arg containing the substring, still dedup + let args = vec!["--my-dangerously-skip-permissions-flag".to_string()]; + assert_eq!( + compute_bypass_flag("claude", &args), + None, + "substring match should prevent duplication" + ); +} + +#[test] +fn bypass_flag_codex_with_other_args() { + let args = vec!["--model".to_string(), "gpt-4".to_string()]; + assert_eq!( + compute_bypass_flag("codex", &args), + Some("--dangerously-bypass-approvals-and-sandbox"), + "unrelated args should not prevent bypass flag" + ); +} + +// ==================== is_pid_alive ==================== + +#[test] +fn is_pid_alive_returns_true_for_self() { + let pid = std::process::id(); + assert!( + crate::broker::is_pid_alive(pid), + "current process PID should be alive" + ); +} + +#[test] +fn is_pid_alive_returns_false_for_dead_pid() { + // Spawn a short-lived 
child, wait for it to exit, then verify it's dead + let child = std::process::Command::new("true") + .spawn() + .expect("failed to spawn 'true'"); + let pid = child.id(); + let mut child = child; + child.wait().expect("failed to wait on child"); + // After the child exits, its PID should not be alive + // (the PID may be recycled, but on macOS/Linux it won't be immediately) + assert!( + !crate::broker::is_pid_alive(pid), + "exited child PID should be dead" + ); +} + +#[test] +fn is_pid_alive_returns_false_for_bogus_pid() { + // PID 0 is the kernel scheduler — kill(0, 0) signals the entire process group, + // not a real target. Use a very high PID that almost certainly doesn't exist. + // On macOS pid_max is ~99999; on Linux it's typically 32768 or 4194304. + // 4_000_000 is unlikely to be in use. + assert!( + !crate::broker::is_pid_alive(4_000_000), + "bogus PID 4_000_000 should not be alive (ESRCH)" + ); +} + +#[test] +fn is_pid_alive_eperm_means_alive() { + // PID 1 (launchd/init) is owned by root. When run as a normal user, + // kill(1, 0) returns EPERM — the process exists but we can't signal it. + // This is exactly the EPERM case our fix handles. + // Skip if running as root (e.g., in some CI containers) since root can + // signal any process and would get rc=0 instead of EPERM. 
+ if unsafe { nix::libc::getuid() } == 0 { + eprintln!("skipping EPERM test: running as root"); + return; + } + assert!( + crate::broker::is_pid_alive(1), + "PID 1 (init/launchd) should report alive via EPERM" + ); +} + +// ==================== write_pid_file ==================== + +// ==================== continuity_dir ==================== + +#[test] +fn continuity_dir_derives_correct_path_from_state_json() { + let state_path = std::path::Path::new("/project/.agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!( + result, + std::path::PathBuf::from("/project/.agent-relay/continuity") + ); +} + +#[test] +fn continuity_dir_works_with_nested_project_path() { + let state_path = std::path::Path::new("/home/user/projects/my-app/.agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!( + result, + std::path::PathBuf::from("/home/user/projects/my-app/.agent-relay/continuity") + ); +} + +#[test] +fn continuity_dir_preserves_relative_paths() { + let state_path = std::path::Path::new(".agent-relay/state.json"); + let result = continuity_dir(state_path); + assert_eq!(result, std::path::PathBuf::from(".agent-relay/continuity")); +} + +#[test] +fn http_api_spawn_spec_defaults_to_pty_runtime() { + let spec = build_http_api_spawn_spec( + "worker-a".to_string(), + "codex".to_string(), + None, + Some("o3".to_string()), + vec!["--fast".to_string()], + vec!["general".to_string()], + Some("/tmp/project".to_string()), + Some("core".to_string()), + Some("Lead".to_string()), + Some("subagent".to_string()), + None, + ) + .expect("spec should build"); + + assert!(matches!(spec.runtime, AgentRuntime::Pty)); + assert!(spec.provider.is_none()); + assert_eq!(spec.cli.as_deref(), Some("codex")); + assert_eq!(spec.model.as_deref(), Some("o3")); +} + +#[test] +fn http_api_spawn_spec_uses_headless_runtime_for_supported_providers() { + let spec = build_http_api_spawn_spec( + "worker-a".to_string(), + "opencode".to_string(), + 
Some("headless".to_string()), + Some("ignored".to_string()), + vec![], + vec!["general".to_string()], + None, + None, + None, + None, + None, + ) + .expect("headless spec should build"); + + assert!(matches!(spec.runtime, AgentRuntime::Headless)); + assert!(matches!( + spec.provider, + Some(ProtocolHeadlessProvider::Opencode) + )); + assert!(spec.cli.is_none()); + assert_eq!(spec.model.as_deref(), Some("ignored")); +} + +#[test] +fn headless_provider_command_claude_places_flags_before_task() { + let (bin, args) = super::headless_provider_command( + &ProtocolHeadlessProvider::Claude, + "hello world", + &[ + "--mcp-config".to_string(), + "{\"mcpServers\":{}}".to_string(), + ], + ); + + assert_eq!(bin, "claude"); + assert_eq!(args.last().map(String::as_str), Some("hello world")); + let mcp_pos = args.iter().position(|a| a == "--mcp-config").unwrap(); + let task_pos = args.iter().position(|a| a == "hello world").unwrap(); + assert!(mcp_pos < task_pos, "--mcp-config must precede task"); +} + +#[test] +fn headless_provider_command_opencode_places_flags_before_task() { + let (bin, args) = super::headless_provider_command( + &ProtocolHeadlessProvider::Opencode, + "hello world", + &["--agent".to_string(), "relaycast".to_string()], + ); + + assert_eq!(bin, "opencode"); + assert_eq!(args.first().map(String::as_str), Some("run")); + assert_eq!(args.last().map(String::as_str), Some("hello world")); + let agent_pos = args.iter().position(|a| a == "--agent").unwrap(); + let task_pos = args.iter().position(|a| a == "hello world").unwrap(); + assert!(agent_pos < task_pos, "--agent must precede task"); +} + +#[test] +fn http_api_spawn_spec_rejects_unknown_headless_providers() { + let error = build_http_api_spawn_spec( + "worker-a".to_string(), + "codex".to_string(), + Some("headless".to_string()), + None, + vec![], + vec!["general".to_string()], + None, + None, + None, + None, + None, + ) + .expect_err("unsupported headless provider should fail"); + + assert!( + error + .to_string() 
+ .contains("does not support headless transport"), + "unexpected error: {error}" + ); +} + +// ==================== model flag injection tests ==================== +// Tests for the --model flag injection logic used in WorkerRegistry::spawn(). +// When spec.model is set and non-empty, the broker should inject --model +// into the spawned CLI's argv, unless the user already specified --model. + +/// Mirror of the model flag logic in WorkerRegistry::spawn(). +fn compute_model_flag(model: Option<&str>, existing_args: &[String]) -> Option { + model.and_then(|m| { + if m.is_empty() + || existing_args + .iter() + .any(|a| a == "--model" || a.starts_with("--model=") || a == "-m") + { + None + } else { + Some(m.to_string()) + } + }) +} + +#[test] +fn model_flag_injected_when_present() { + assert_eq!( + compute_model_flag(Some("haiku"), &[]), + Some("haiku".to_string()), + "model should be injected when set and args are empty" + ); +} + +#[test] +fn model_flag_not_injected_when_none() { + assert_eq!( + compute_model_flag(None, &[]), + None, + "model should not be injected when not set" + ); +} + +#[test] +fn model_flag_not_injected_when_empty() { + assert_eq!( + compute_model_flag(Some(""), &[]), + None, + "model should not be injected when empty string" + ); +} + +#[test] +fn model_flag_not_injected_when_already_in_args() { + let args = vec!["--model".to_string(), "opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), &args), + None, + "model should not be injected when --model already in args" + ); +} + +#[test] +fn model_flag_not_injected_when_short_flag_in_args() { + let args = vec!["-m".to_string(), "opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), &args), + None, + "model should not be injected when -m already in args" + ); +} + +#[test] +fn model_flag_not_injected_when_equals_format_in_args() { + let args = vec!["--model=opus".to_string()]; + assert_eq!( + compute_model_flag(Some("haiku"), &args), + None, + "model should not be 
injected when --model=value already in args" + ); +} + +#[test] +fn model_flag_injected_with_other_args() { + let args = vec!["--verbose".to_string()]; + assert_eq!( + compute_model_flag(Some("gpt-4o"), &args), + Some("gpt-4o".to_string()), + "model should be injected when other unrelated args exist" + ); +} diff --git a/crates/broker/src/runtime/util.rs b/crates/broker/src/runtime/util.rs new file mode 100644 index 000000000..7cccdce55 --- /dev/null +++ b/crates/broker/src/runtime/util.rs @@ -0,0 +1,211 @@ +use super::*; + +pub(crate) fn startup_debug_enabled() -> bool { + std::env::var("AGENT_RELAY_STARTUP_DEBUG") + .map(|value| { + let trimmed = value.trim(); + !trimmed.is_empty() && trimmed != "0" && !trimmed.eq_ignore_ascii_case("false") + }) + .unwrap_or(false) +} + +pub(crate) fn log_startup_phase(enabled: bool, started_at: Instant, message: impl AsRef) { + if enabled { + eprintln!( + "[agent-relay][startup +{}ms] {}", + started_at.elapsed().as_millis(), + message.as_ref() + ); + } +} + +pub(crate) fn init_tracing() { + let (writer, guard) = tracing_appender::non_blocking(std::io::stderr()); + let subscriber = tracing_subscriber::fmt::Subscriber::builder() + .with_env_filter( + tracing_subscriber::EnvFilter::try_from_default_env() + .unwrap_or_else(|_| tracing_subscriber::EnvFilter::new("info")), + ) + .with_target(true) + .with_writer(writer) + .finish(); + if tracing::subscriber::set_global_default(subscriber).is_ok() { + let _ = TRACING_GUARD.set(guard); + } +} + +pub(crate) fn channels_from_csv(raw: &str) -> Vec { + raw.split(',') + .map(str::trim) + .filter(|s| !s.is_empty()) + .map(ToOwned::to_owned) + .collect() +} + +/// Default channels for freshly spawned agents. +/// Reads RELAY_DEFAULT_CHANNELS (comma-separated) or falls back to the +/// broker's default channels: vec!["general", "engineering"] — both created +/// at startup by ensure_default_channels(). 
+pub(crate) fn default_spawn_channels() -> Vec { + if let Ok(raw) = std::env::var("RELAY_DEFAULT_CHANNELS") { + let parsed = channels_from_csv(&raw); + if !parsed.is_empty() { + return parsed; + } + } + // channels: ["general", "engineering"] (must match ensure_default_channels) + vec!["general".to_string(), "engineering".to_string()] +} + +pub(crate) fn command_targets_self(cmd_event: &BrokerCommandEvent, self_agent_id: &str) -> bool { + match cmd_event.handler_agent_id.as_deref() { + Some(handler_id) => handler_id == self_agent_id, + None => { + tracing::warn!( + command = %cmd_event.command, + invoked_by = %cmd_event.invoked_by, + "command has no handler_agent_id; accepting by default (multi-broker setups should scope commands)" + ); + true + } + } +} + +pub(crate) fn env_flag_enabled(name: &str) -> bool { + std::env::var(name) + .ok() + .map(|value| value.trim().to_ascii_lowercase()) + .is_some_and(|value| matches!(value.as_str(), "1" | "true" | "yes" | "on")) +} + +pub(crate) fn delivery_retry_interval() -> Duration { + let ms = std::env::var("AGENT_RELAY_DELIVERY_RETRY_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_DELIVERY_RETRY_MS); + Duration::from_millis(ms.max(50)) +} + +pub(crate) fn http_api_local_delivery_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS); + Duration::from_millis(ms.max(100)) +} + +pub(crate) fn http_api_relaycast_send_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + .unwrap_or(DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS); + Duration::from_millis(ms.max(500)) +} + +pub(crate) fn http_api_event_emit_timeout() -> Duration { + let ms = std::env::var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS") + .ok() + .and_then(|raw| raw.trim().parse::().ok()) + 
.unwrap_or(DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS); + Duration::from_millis(ms.max(25)) +} + +pub(crate) fn normalize_channel(raw: &str) -> String { + let trimmed = raw.trim(); + if trimmed.starts_with('#') { + trimmed.to_string() + } else { + format!("#{trimmed}") + } +} + +pub(crate) fn build_agent_state_transition_event( + name: &str, + state: &str, + reason: Option<&str>, +) -> Value { + let mut payload = json!({ + "type": "agent.state", + "state": state, + "agent": { "name": name }, + "timestamp": chrono::Utc::now().to_rfc3339(), + }); + if let Some(reason) = reason.map(str::trim).filter(|value| !value.is_empty()) { + payload["reason"] = json!(reason); + } + payload +} + +pub(crate) async fn publish_agent_state_transition( + ws_control_tx: &mpsc::Sender, + name: &str, + state: &str, + reason: Option<&str>, +) { + let event = build_agent_state_transition_event(name, state, reason); + if let Err(error) = ws_control_tx.send(WsControl::Publish(event)).await { + tracing::debug!( + agent = %name, + state = %state, + error = %error, + "failed to publish agent state transition" + ); + } +} + +/// Get current terminal size. Returns (rows, cols). +/// +/// Uses `crossterm::terminal::size()`, which is cross-platform: +/// TIOCGWINSZ on unix, GetConsoleScreenBufferInfo on Windows. +pub(crate) fn get_terminal_size() -> Option<(u16, u16)> { + crossterm::terminal::size() + .ok() + .map(|(cols, rows)| (rows, cols)) +} + +/// Detect Claude Code auto-suggestion ghost text. +/// +/// Auto-suggestions are rendered with reverse-video cursor + dim ghost text, +/// and often include the "↵ send" hint. +/// Extract Relaycast message IDs from MCP tool response output. +/// +/// When the agent sends a message via MCP (send_dm, send_message, etc.), +/// the response JSON contains `"id": ""`. We extract these IDs +/// and pre-seed the dedup cache so the WS echo of the same message is dropped. 
+/// This is more robust than name-based filtering since it works regardless +/// of what identity the MCP server registers with. +pub(crate) fn extract_mcp_message_ids(buffer: &str) -> Vec { + let mut ids = Vec::new(); + // Match patterns like "id": "147310274064424960" (Relaycast snowflake IDs are 18-digit numbers) + let mut search_start = 0; + while let Some(key_pos) = buffer[search_start..].find("\"id\"") { + let abs_pos = search_start + key_pos + 4; // skip past "id" + if abs_pos >= buffer.len() { + break; + } + let rest = &buffer[abs_pos..]; + // Skip whitespace and colon + let rest = rest.trim_start(); + let rest = if let Some(r) = rest.strip_prefix(':') { + r.trim_start() + } else { + search_start = abs_pos; + continue; + }; + // Extract quoted value + if let Some(r) = rest.strip_prefix('"') { + if let Some(end) = r.find('"') { + let value = &r[..end]; + // Only match numeric snowflake IDs (15-20 digits) + if value.len() >= 15 + && value.len() <= 20 + && value.chars().all(|c| c.is_ascii_digit()) + { + ids.push(value.to_string()); + } + } + } + search_start = abs_pos; + } + ids +} From 450740c7201e6fd4c3cfb9c875ef2f4bff7e98e5 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 21:39:09 -0400 Subject: [PATCH 5/8] chore: record runtime split trajectory --- .../completed/2026-05/traj_47akjihewlow.json | 73 ++++++ .../completed/2026-05/traj_47akjihewlow.md | 41 +++ .../2026-05/traj_47akjihewlow.trace.json | 234 ++++++++++++++++++ .trajectories/index.json | 9 +- 4 files changed, 356 insertions(+), 1 deletion(-) create mode 100644 .trajectories/completed/2026-05/traj_47akjihewlow.json create mode 100644 .trajectories/completed/2026-05/traj_47akjihewlow.md create mode 100644 .trajectories/completed/2026-05/traj_47akjihewlow.trace.json diff --git a/.trajectories/completed/2026-05/traj_47akjihewlow.json b/.trajectories/completed/2026-05/traj_47akjihewlow.json new file mode 100644 index 000000000..fd86d6ba5 --- /dev/null +++ 
b/.trajectories/completed/2026-05/traj_47akjihewlow.json @@ -0,0 +1,73 @@ +{ + "id": "traj_47akjihewlow", + "version": 1, + "task": { + "title": "Further split broker runtime module for issue 875", + "source": { + "system": "plain", + "id": "#875" + } + }, + "status": "completed", + "startedAt": "2026-05-19T01:28:35.746Z", + "completedAt": "2026-05-19T01:38:29.105Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T01:36:08.629Z" + } + ], + "chapters": [ + { + "id": "chap_t1myrcwsr9k7", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T01:36:08.629Z", + "endedAt": "2026-05-19T01:38:29.105Z", + "events": [ + { + "ts": 1779154568630, + "type": "decision", + "content": "Split broker runtime by responsibility: Split broker runtime by responsibility", + "raw": { + "question": "Split broker runtime by responsibility", + "chosen": "Split broker runtime by responsibility", + "alternatives": [], + "reasoning": "Kept the CLI-facing runtime API stable while moving cohesive concerns into runtime submodules: init loop, session setup, pending delivery, headless worker, connection discovery, paths, terminal/system helpers, message/thread helpers, frame IO, and spawn spec parsing." 
+ }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Split broker runtime into focused modules for session setup, init loop, pending delivery, headless workers, connection discovery, paths, frame I/O, message/thread helpers, system helpers, spawn spec parsing, and tests.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": ["7182810c"], + "filesChanged": [ + "crates/broker/src/runtime.rs", + "crates/broker/src/runtime/connection.rs", + "crates/broker/src/runtime/delivery.rs", + "crates/broker/src/runtime/headless.rs", + "crates/broker/src/runtime/init.rs", + "crates/broker/src/runtime/io.rs", + "crates/broker/src/runtime/messages.rs", + "crates/broker/src/runtime/mod.rs", + "crates/broker/src/runtime/paths.rs", + "crates/broker/src/runtime/session.rs", + "crates/broker/src/runtime/spawn_spec.rs", + "crates/broker/src/runtime/system.rs", + "crates/broker/src/runtime/tests.rs", + "crates/broker/src/runtime/util.rs" + ], + "projectId": "/Users/will/Projects/AgentWorkforce/relay", + "tags": [], + "_trace": { + "startRef": "d52c1476dc8b28f8504cf9d60dce10b1719b8c59", + "endRef": "7182810ce4863543c825d747a8f159644981d485", + "traceId": "c271dee5-bc34-4695-8fb1-bb228ded699f" + } +} diff --git a/.trajectories/completed/2026-05/traj_47akjihewlow.md b/.trajectories/completed/2026-05/traj_47akjihewlow.md new file mode 100644 index 000000000..babd08661 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_47akjihewlow.md @@ -0,0 +1,41 @@ +# Trajectory: Further split broker runtime module for issue 875 + +> **Status:** ✅ Completed +> **Task:** #875 +> **Confidence:** 90% +> **Started:** May 18, 2026 at 09:28 PM +> **Completed:** May 18, 2026 at 09:38 PM + +--- + +## Summary + +Split broker runtime into focused modules for session setup, init loop, pending delivery, headless workers, connection discovery, paths, frame I/O, message/thread helpers, system helpers, spawn spec parsing, and tests. 
+ +**Approach:** Standard approach + +--- + +## Key Decisions + +### Split broker runtime by responsibility + +- **Chose:** Split broker runtime by responsibility +- **Reasoning:** Kept the CLI-facing runtime API stable while moving cohesive concerns into runtime submodules: init loop, session setup, pending delivery, headless worker, connection discovery, paths, terminal/system helpers, message/thread helpers, frame IO, and spawn spec parsing. + +--- + +## Chapters + +### 1. Work + +_Agent: default_ + +- Split broker runtime by responsibility: Split broker runtime by responsibility + +--- + +## Artifacts + +**Commits:** 7182810c +**Files changed:** 14 diff --git a/.trajectories/completed/2026-05/traj_47akjihewlow.trace.json b/.trajectories/completed/2026-05/traj_47akjihewlow.trace.json new file mode 100644 index 000000000..9b632fe61 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_47akjihewlow.trace.json @@ -0,0 +1,234 @@ +{ + "version": "1.0.0", + "id": "c271dee5-bc34-4695-8fb1-bb228ded699f", + "timestamp": "2026-05-19T01:38:29.210Z", + "trajectory": "traj_47akjihewlow", + "files": [ + { + "path": "crates/broker/src/runtime/connection.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 193, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/delivery.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 455, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/headless.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 387, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/init.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": 
[ + { + "start_line": 1, + "end_line": 4, + "revision": "7182810ce4863543c825d747a8f159644981d485" + }, + { + "start_line": 3207, + "end_line": 3209, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/io.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 70, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/messages.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 572, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/mod.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 89, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/paths.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 184, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/session.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 264, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/spawn_spec.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 68, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/system.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 94, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": 
"crates/broker/src/runtime/tests.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 1743, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + }, + { + "path": "crates/broker/src/runtime/util.rs", + "conversations": [ + { + "contributor": { + "type": "ai" + }, + "ranges": [ + { + "start_line": 1, + "end_line": 211, + "revision": "7182810ce4863543c825d747a8f159644981d485" + } + ] + } + ] + } + ] +} diff --git a/.trajectories/index.json b/.trajectories/index.json index 043669111..7c7ae2ef5 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-05-19T00:55:57.678Z", + "lastUpdated": "2026-05-19T01:38:29.338Z", "trajectories": { "traj_05xg7j388bc4": { "title": "Add browser workflow step integration", @@ -960,6 +960,13 @@ "startedAt": "2026-05-19T00:54:40.328Z", "completedAt": "2026-05-19T00:55:57.506Z", "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_f9wxa8ujeg78.json" + }, + "traj_47akjihewlow": { + "title": "Further split broker runtime module for issue 875", + "status": "completed", + "startedAt": "2026-05-19T01:28:35.746Z", + "completedAt": "2026-05-19T01:38:29.105Z", + "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_47akjihewlow.json" } } } From 7672f7081fd75a7bd49116ab8e91411f1b3123ba Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 21:51:12 -0400 Subject: [PATCH 6/8] refactor: extract broker runtime event handlers --- .../completed/2026-05/traj_x37bhga2j5ph.json | 57 + .../completed/2026-05/traj_x37bhga2j5ph.md | 34 + .trajectories/index.json | 9 +- crates/broker/src/runtime/api.rs | 1236 +++++++ crates/broker/src/runtime/event_loop.rs | 173 + crates/broker/src/runtime/init.rs | 2855 +---------------- crates/broker/src/runtime/maintenance.rs | 353 ++ crates/broker/src/runtime/mod.rs | 6 + 
crates/broker/src/runtime/relaycast_events.rs | 840 +++++ crates/broker/src/runtime/tests.rs | 10 +- crates/broker/src/runtime/worker_events.rs | 573 ++++ 11 files changed, 3343 insertions(+), 2803 deletions(-) create mode 100644 .trajectories/completed/2026-05/traj_x37bhga2j5ph.json create mode 100644 .trajectories/completed/2026-05/traj_x37bhga2j5ph.md create mode 100644 crates/broker/src/runtime/api.rs create mode 100644 crates/broker/src/runtime/event_loop.rs create mode 100644 crates/broker/src/runtime/maintenance.rs create mode 100644 crates/broker/src/runtime/relaycast_events.rs create mode 100644 crates/broker/src/runtime/worker_events.rs diff --git a/.trajectories/completed/2026-05/traj_x37bhga2j5ph.json b/.trajectories/completed/2026-05/traj_x37bhga2j5ph.json new file mode 100644 index 000000000..8970ea349 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_x37bhga2j5ph.json @@ -0,0 +1,57 @@ +{ + "id": "traj_x37bhga2j5ph", + "version": 1, + "task": { + "title": "Deepen broker runtime refactor for PR 906", + "source": { + "system": "plain", + "id": "AgentWorkforce/relay#875" + } + }, + "status": "completed", + "startedAt": "2026-05-19T01:42:10.602Z", + "completedAt": "2026-05-19T01:50:40.359Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T01:50:18.748Z" + } + ], + "chapters": [ + { + "id": "chap_gm8mr25x5hgw", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T01:50:18.748Z", + "endedAt": "2026-05-19T01:50:40.359Z", + "events": [ + { + "ts": 1779155418749, + "type": "decision", + "content": "Split broker runtime into event-loop context and event handlers: Split broker runtime into event-loop context and event handlers", + "raw": { + "question": "Split broker runtime into event-loop context and event handlers", + "chosen": "Split broker runtime into event-loop context and event handlers", + "alternatives": [], + "reasoning": "The prior runtime/init split still left long-lived state and every 
select branch in one function. Moving state into BrokerRuntime and dispatching RuntimeEvent to HTTP API, Relaycast, worker-event, and maintenance handlers makes ownership and control flow explicit while preserving tested behavior." + }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Split the broker runtime beyond the init facade: run_init now bootstraps services and hands off to BrokerRuntime, which dispatches typed RuntimeEvent values to focused HTTP API, Relaycast inbound, worker-event, and maintenance handlers. Updated source-inspection contract tests to follow the new handler files and verified cargo fmt, test, and clippy for agent-relay-broker.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/will/Projects/AgentWorkforce/relay", + "tags": [], + "_trace": { + "startRef": "450740c7201e6fd4c3cfb9c875ef2f4bff7e98e5", + "endRef": "450740c7201e6fd4c3cfb9c875ef2f4bff7e98e5" + } +} diff --git a/.trajectories/completed/2026-05/traj_x37bhga2j5ph.md b/.trajectories/completed/2026-05/traj_x37bhga2j5ph.md new file mode 100644 index 000000000..2ae16e367 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_x37bhga2j5ph.md @@ -0,0 +1,34 @@ +# Trajectory: Deepen broker runtime refactor for PR 906 + +> **Status:** ✅ Completed +> **Task:** AgentWorkforce/relay#875 +> **Confidence:** 90% +> **Started:** May 18, 2026 at 09:42 PM +> **Completed:** May 18, 2026 at 09:50 PM + +--- + +## Summary + +Split the broker runtime beyond the init facade: run_init now bootstraps services and hands off to BrokerRuntime, which dispatches typed RuntimeEvent values to focused HTTP API, Relaycast inbound, worker-event, and maintenance handlers. Updated source-inspection contract tests to follow the new handler files and verified cargo fmt, test, and clippy for agent-relay-broker. 
+ +**Approach:** Standard approach + +--- + +## Key Decisions + +### Split broker runtime into event-loop context and event handlers + +- **Chose:** Split broker runtime into event-loop context and event handlers +- **Reasoning:** The prior runtime/init split still left long-lived state and every select branch in one function. Moving state into BrokerRuntime and dispatching RuntimeEvent to HTTP API, Relaycast, worker-event, and maintenance handlers makes ownership and control flow explicit while preserving tested behavior. + +--- + +## Chapters + +### 1. Work + +_Agent: default_ + +- Split broker runtime into event-loop context and event handlers: Split broker runtime into event-loop context and event handlers diff --git a/.trajectories/index.json b/.trajectories/index.json index 7c7ae2ef5..c629337ae 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-05-19T01:38:29.338Z", + "lastUpdated": "2026-05-19T01:50:40.535Z", "trajectories": { "traj_05xg7j388bc4": { "title": "Add browser workflow step integration", @@ -967,6 +967,13 @@ "startedAt": "2026-05-19T01:28:35.746Z", "completedAt": "2026-05-19T01:38:29.105Z", "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_47akjihewlow.json" + }, + "traj_x37bhga2j5ph": { + "title": "Deepen broker runtime refactor for PR 906", + "status": "completed", + "startedAt": "2026-05-19T01:42:10.602Z", + "completedAt": "2026-05-19T01:50:40.359Z", + "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_x37bhga2j5ph.json" } } } diff --git a/crates/broker/src/runtime/api.rs b/crates/broker/src/runtime/api.rs new file mode 100644 index 000000000..3c621de82 --- /dev/null +++ b/crates/broker/src/runtime/api.rs @@ -0,0 +1,1236 @@ +use super::*; + +impl BrokerRuntime { + pub(super) async fn handle_api_request(&mut self, req: ListenApiRequest) { + let paths = &self.paths; + let state = &mut self.state; + 
let workspaces = &self.workspaces; + let workspace_lookup = &self.workspace_lookup; + let default_workspace_id = &self.default_workspace_id; + let self_names = &self.self_names; + let relaycast_http = &self.relaycast_http; + let ws_control_tx = &self.ws_control_tx; + let sdk_out_tx = &self.sdk_out_tx; + let workers = &mut self.workers; + let telemetry = &self.telemetry; + let agent_spawn_count = &mut self.agent_spawn_count; + let pending_deliveries = &mut self.pending_deliveries; + let pending_requests = &mut self.pending_requests; + let delivery_states = &mut self.delivery_states; + let dedup = &mut self.dedup; + let recent_thread_messages = &mut self.recent_thread_messages; + let delivery_retry_interval = self.delivery_retry_interval; + let last_lease_renewal = &mut self.last_lease_renewal; + let lease_duration = self.lease_duration; + let persist = self.persist; + let shutdown = &mut self.shutdown; + let crash_insights = &self.crash_insights; + + match req { + ListenApiRequest::Spawn { + name, + cli, + transport, + model, + args, + task, + channels, + cwd, + team, + shadow_of, + shadow_mode, + continue_from, + idle_threshold_secs, + skip_relay_prompt, + restart_policy, + agent_token, + reply, + } => { + let effective_channels = if channels.is_empty() { + default_spawn_channels() + } else { + channels.clone() + }; + let spec = match build_http_api_spawn_spec( + name.clone(), + cli.clone(), + transport, + model.clone(), + args, + effective_channels.clone(), + cwd, + team, + shadow_of, + shadow_mode, + *restart_policy, + ) { + Ok(spec) => spec, + Err(error) => { + let _ = reply.send(Err(error.to_string())); + return; + } + }; + let mut preregistration_warning: Option = None; + let registration_result = + retry_agent_registration(relaycast_http, &name, Some(&cli)).await; + let worker_relay_key = match registration_result { + Ok(token) => Some(token), + Err(RegRetryOutcome::RetryableExhausted(error)) => { + let message = format_worker_preregistration_error(&name, 
&error); + tracing::warn!( + worker = %name, + error = %error, + "continuing spawn without pre-registration after retries exhausted" + ); + preregistration_warning = Some(message); + None + } + Err(RegRetryOutcome::Fatal(error)) => { + let _ = reply.send(Err(format_worker_preregistration_error(&name, &error))); + return; + } + }; + + // Caller-supplied agent_token overrides auto-registration + let worker_relay_key = agent_token.or(worker_relay_key); + + let mut effective_task = normalize_initial_task(task); + if let Some(ref continue_from) = continue_from { + let continuity_dir = continuity_dir(&paths.state); + let continuity_file = continuity_dir.join(format!("{}.json", continue_from)); + if continuity_file.exists() { + match std::fs::read_to_string(&continuity_file) { + Ok(contents) => { + if let Ok(ctx) = serde_json::from_str::(&contents) { + let prev_task = ctx + .get("initial_task") + .and_then(Value::as_str) + .unwrap_or("unknown"); + let summary = ctx + .get("summary") + .and_then(Value::as_str) + .unwrap_or("no summary available"); + let messages = ctx + .get("message_history") + .and_then(Value::as_array) + .map(|msgs| { + msgs.iter() + .filter_map(|m| { + let from = m + .get("from") + .and_then(Value::as_str) + .unwrap_or("?"); + let text = m + .get("text") + .and_then(Value::as_str) + .unwrap_or(""); + if text.is_empty() { + None + } else { + Some(format!(" {}: {}", from, text)) + } + }) + .collect::>() + .join("\n") + }) + .unwrap_or_default(); + + let continuity_block = format!( + "## Continuity Context (from previous session as '{}')\n\ + Previous task: {}\n\ + Session summary: {}\n{}", + continue_from, + prev_task, + summary, + if messages.is_empty() { + String::new() + } else { + format!("Recent messages:\n{}\n", messages) + } + ); + + effective_task = Some(match effective_task { + Some(new_task) => { + format!( + "{}\n\n## Current Task\n{}", + continuity_block, new_task + ) + } + None => continuity_block, + }); + tracing::info!( + agent = %name, + 
continue_from = %continue_from, + "injected continuity context from previous session for HTTP API spawn" + ); + } + } + Err(e) => { + tracing::warn!( + agent = %name, + continue_from = %continue_from, + error = %e, + "failed to read continuity file for HTTP API spawn" + ); + } + } + } else { + tracing::warn!( + agent = %name, + continue_from = %continue_from, + "no continuity file found at {}", + continuity_file.display() + ); + } + } + + match workers + .spawn( + spec, + Some("Dashboard".to_string()), + None, + worker_relay_key.clone(), + skip_relay_prompt, + idle_threshold_secs.map(|s| s.to_string()), + ) + .await + { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers + .initial_tasks + .insert(name.clone(), task_text.clone()); + } + *agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::HumanDashboard, + has_task: effective_task.is_some(), + is_shadow: effective_spec.shadow_of.is_some() + || effective_spec.shadow_mode.is_some(), + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent { + runtime: effective_spec.runtime.clone(), + parent: Some("Dashboard".to_string()), + channels: effective_spec.channels.clone(), + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + }, + ); + if paths.persist { + let _ = state.save(&paths.state); + } + note_local_spawn_control_dedup( + dedup, + default_workspace_id.as_deref().or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.as_str()) + }), + &name, + worker_relay_key.as_deref(), + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"agent_spawned", + "name":&name, 
+ "runtime":runtime_label(&effective_spec.runtime), + "provider": effective_spec.provider.clone(), + "cli": effective_spec.cli.clone(), + "model": effective_spec.model.clone(), + "pid":pid, + "source":"http_api", + "pre_registered": worker_relay_key.is_some(), + "registration_warning": preregistration_warning.clone(), + }), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + &name, + "spawned", + Some("http_api_spawn"), + ) + .await; + let _ = reply.send(Ok(json!({ + "success": true, + "name": name, + "runtime": runtime_label(&effective_spec.runtime), + "model": effective_spec.model.clone(), + "pid": pid, + "pre_registered": worker_relay_key.is_some(), + "warning": preregistration_warning, + }))); + } + Err(e) => { + eprintln!("[agent-relay] HTTP API: failed to spawn '{}': {}", name, e); + let _ = reply.send(Err(e.to_string())); + } + } + } + ListenApiRequest::SetModel { + name, + model, + timeout_ms, + reply, + } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + return; + }; + + let model_command = format!("/model {}\n", model); + let result = async { + handle + .stdin + .write_all(model_command.as_bytes()) + .await + .with_context(|| { + format!("failed writing model command to worker '{}'", name) + })?; + handle + .stdin + .flush() + .await + .with_context(|| format!("failed flushing worker '{}' stdin", name))?; + if let Some(timeout_ms) = timeout_ms { + tracing::info!( + name = %name, + timeout_ms, + "HTTP API set_model timeout_ms is currently advisory only" + ); + } + Ok::<(), anyhow::Error>(()) + } + .await; + + match result { + Ok(()) => { + let _ = reply.send(Ok(json!({ + "name": name, + "model": model, + "success": true, + }))); + } + Err(error) => { + let _ = reply.send(Err(error.to_string())); + } + } + } + ListenApiRequest::Release { + name, + reason, + reply, + } => { + if let Some(ref r) = reason { + tracing::info!(worker = %name, reason = %r, "releasing 
agent via HTTP API"); + } + // Unregister from supervisor before release to prevent + // auto-restart of intentionally released agents. + workers.supervisor.unregister(&name); + workers.metrics.on_release(&name); + match workers.release(&name).await { + Ok(()) => { + if let Err(error) = relaycast_http.mark_agent_offline(&name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark released worker offline in relaycast" + ); + } + let dropped = drop_pending_for_worker(pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + sdk_out_tx, + json!({"kind":"delivery_dropped","name":&name,"count":dropped,"reason":"agent_released"}), + ).await; + } + fail_pending_requests_for_worker(pending_requests, &name, "agent_released"); + delivery_states.remove(&name); + state.agents.remove(&name); + if paths.persist { + let _ = state.save(&paths.state); + } + let _ = + send_event(sdk_out_tx, json!({"kind":"agent_released","name":&name})) + .await; + publish_agent_state_transition( + ws_control_tx, + &name, + "exited", + Some("http_api_release"), + ) + .await; + let _ = reply.send(Ok(json!({ "success": true, "name": name }))); + } + Err(e) => { + let message = e.to_string(); + if is_unknown_worker_error_message(&message) { + relaycast_http.forget_agent_registration(&name); + state.agents.remove(&name); + if paths.persist { + let _ = state.save(&paths.state); + } + tracing::debug!( + worker = %name, + "ignoring duplicate HTTP API release for already exited worker" + ); + let _ = reply.send(Ok(json!({ "success": true, "name": name }))); + } else { + eprintln!( + "[agent-relay] HTTP API: failed to release '{}': {}", + name, e + ); + let _ = reply.send(Err(message)); + } + } + } + } + ListenApiRequest::Send { + to, + text, + from, + thread_id, + workspace_id, + workspace_alias, + mode, + reply, + } => { + let normalized_to = to.trim().to_string(); + let selected_workspace = if let Some(workspace_id) = workspace_id.as_deref() { + 
workspace_lookup.get(workspace_id).cloned().ok_or_else(|| { + format!( + "workspace_not_found:workspace '{}' is not attached", + workspace_id + ) + }) + } else if let Some(workspace_alias) = workspace_alias.as_deref() { + workspaces + .iter() + .find(|workspace| { + workspace + .workspace_alias + .as_deref() + .is_some_and(|alias| alias.eq_ignore_ascii_case(workspace_alias)) + }) + .cloned() + .ok_or_else(|| { + format!( + "workspace_not_found:workspace alias '{}' is not attached", + workspace_alias + ) + }) + } else if workspaces.len() == 1 { + Ok(workspaces[0].clone()) + } else if let Some(default_workspace_id) = default_workspace_id.as_deref() { + workspace_lookup + .get(default_workspace_id) + .cloned() + .ok_or_else(|| { + format!( + "workspace_not_found: default workspace '{}' not found", + default_workspace_id + ) + }) + } else { + Err("ambiguous_workspace:workspaceId or workspaceAlias is required when multiple workspaces are attached".to_string()) + }; + let selected_workspace = match selected_workspace { + Ok(workspace) => workspace, + Err(error) => { + let _ = reply.send(Err(error)); + return; + } + }; + let selected_workspace_id = selected_workspace.workspace_id.clone(); + let selected_workspace_alias = selected_workspace.workspace_alias.clone(); + let workspace_self_name = selected_workspace.self_name.clone(); + let normalized_sender = normalize_sender(from.clone()); + let from_dashboard = + sender_is_dashboard_label(&normalized_sender, &workspace_self_name); + let delivery_from = if from_dashboard { + workspace_self_name.clone() + } else { + normalized_sender.clone() + }; + tracing::info!( + target = "relay_broker::http_api", + + raw_from = ?from, + normalized_sender = %normalized_sender, + from_dashboard = %from_dashboard, + delivery_from = %delivery_from, + to = %normalized_to, + thread_id = ?thread_id, + self_name = %workspace_self_name, + "HTTP API send request" + ); + let ui_from = if from_dashboard { + workspace_self_name.clone() + } else { + 
normalized_sender + }; + let event_id = format!("http_{}", Uuid::new_v4().simple()); + let priority = if normalized_to.starts_with('#') { 3 } else { 2 }; + let mut delivered = 0usize; + let mut delivery_errors = 0usize; + let request_start = Instant::now(); + let local_delivery_timeout = http_api_local_delivery_timeout(); + let relaycast_timeout = http_api_relaycast_send_timeout(); + let event_emit_timeout = http_api_event_emit_timeout(); + + record_thread_history_event( + recent_thread_messages, + json!({ + "event_id": event_id.clone(), + "from": ui_from.clone(), + "target": normalized_to.clone(), + "to": normalized_to.clone(), + "text": text.clone(), + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + "timestamp": chrono::Utc::now().to_rfc3339(), + }), + ); + + let targets = if normalized_to.starts_with('#') { + workers.worker_names_for_channel_delivery( + &normalized_to, + &delivery_from, + Some(&selected_workspace_id), + ) + } else { + workers.worker_names_for_direct_target( + &normalized_to, + &delivery_from, + Some(&selected_workspace_id), + ) + }; + + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + delivery_from = %delivery_from, + target_count = %targets.len(), + "resolved HTTP API send targets" + ); + + for worker_name in targets { + // Inbound-delivery queue: every inbound message + // enters the per-worker FIFO first. `auto_inject` + // drains immediately; `manual_flush` holds and + // counts as delivered so the HTTP caller's ack + // semantics are unchanged. We pass the FULL + // routing context so any drain reproduces the + // original delivery (channel/thread/workspace + // /priority/mode), not a stripped-down DM. 
+ match queue_inbound_for_delivery_mode( + delivery_states, + workers, + &worker_name, + InboundContext { + from: &delivery_from, + body: &text, + target: &normalized_to, + thread_id: thread_id.as_deref(), + workspace_id: Some(selected_workspace_id.as_str()), + workspace_alias: selected_workspace_alias.as_deref(), + priority, + mode: mode.clone(), + event_id: Some(&event_id), + }, + ) { + InboundQueueOutcome::Queued => { + delivered = delivered.saturating_add(1); + tracing::info!( + target = "relay_broker::http_api", + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + "queued local delivery (manual_flush inbound delivery mode)" + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"delivery_queued", + "name":&worker_name, + "event_id":&event_id, + "from":&delivery_from, + "target":&normalized_to, + "reason":"inbound_delivery_manual_flush", + }), + ) + .await; + continue; + } + InboundQueueOutcome::DrainNow(to_drain) => { + for queued in to_drain { + let queued_event_id = queued.event_id.as_deref().unwrap_or(""); + let is_current = + queued.event_id.as_deref() == Some(event_id.as_str()); + match timeout( + local_delivery_timeout, + try_inject_pending_relay_message( + workers, + pending_deliveries, + &worker_name, + &queued, + delivery_retry_interval, + ), + ) + .await + { + Ok(Ok(_)) => { + if is_current { + delivered = delivered.saturating_add(1); + } + } + Ok(Err(error)) => { + if is_current { + delivery_errors = delivery_errors.saturating_add(1); + } + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %queued_event_id, + to = %queued.target, + worker = %worker_name, + error = %error, + "local delivery attempt failed" + ); + } + Err(_) => { + if is_current { + delivery_errors = delivery_errors.saturating_add(1); + } + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %queued_event_id, + to = %queued.target, + worker = %worker_name, + timeout_ms = %local_delivery_timeout.as_millis(), + "local delivery 
attempt timed out" + ); + } + } + } + continue; + } + InboundQueueOutcome::WorkerMissing => { + // Fall through so the standard + // not-found accounting path runs. + } + } + match timeout( + local_delivery_timeout, + queue_and_try_delivery_raw( + workers, + pending_deliveries, + &worker_name, + &event_id, + &delivery_from, + &normalized_to, + &text, + thread_id.clone(), + Some(selected_workspace_id.clone()), + selected_workspace_alias.clone(), + priority, + mode.clone(), + delivery_retry_interval, + ), + ) + .await + { + Ok(Ok(_)) => { + delivered = delivered.saturating_add(1); + } + Ok(Err(error)) => { + delivery_errors = delivery_errors.saturating_add(1); + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + error = %error, + "local delivery attempt failed" + ); + } + Err(_) => { + delivery_errors = delivery_errors.saturating_add(1); + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + worker = %worker_name, + timeout_ms = %local_delivery_timeout.as_millis(), + "local delivery attempt timed out" + ); + } + } + } + + if delivered > 0 { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + delivery_from = %delivery_from, + ui_from = %ui_from, + delivered = %delivered, + "local delivery succeeded" + ); + emit_http_api_event_with_timeout( + sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": event_id, + "from": ui_from, + "target": normalized_to, + "body": text, + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + }), + event_emit_timeout, + ) + .await; + if reply + .send(Ok(json!({ + "success": true, + "event_id": event_id, + "delivered": delivered, + "local": true, + "workspace_id": selected_workspace_id, + "workspace_alias": selected_workspace_alias, + }))) + .is_err() + { + 
tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before local delivery response" + ); + } + } else { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + mode = ?mode, + delivery_errors = %delivery_errors, + delivery_from = %delivery_from, + ui_from = %ui_from, + relaycast_timeout_ms = %relaycast_timeout.as_millis(), + "no local deliveries succeeded; forwarding to relaycast" + ); + let relaycast_start = Instant::now(); + match timeout( + relaycast_timeout, + selected_workspace.http_client.send_with_mode( + &normalized_to, + &text, + mode.clone(), + ), + ) + .await + { + Ok(Ok(())) => { + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_ms = %relaycast_start.elapsed().as_millis(), + "relaycast publish succeeded" + ); + emit_http_api_event_with_timeout( + sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": event_id, + "from": ui_from, + "target": normalized_to, + "body": text, + "thread_id": thread_id.clone(), + "workspace_id": selected_workspace_id.clone(), + "workspace_alias": selected_workspace_alias.clone(), + }), + event_emit_timeout, + ) + .await; + if reply + .send(Ok(json!({ + "success": true, + "event_id": event_id, + "relaycast_published": true, + "local": false, + "workspace_id": selected_workspace_id, + "workspace_alias": selected_workspace_alias, + }))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast response" + ); + } + } + Ok(Err(error)) => { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_ms = %relaycast_start.elapsed().as_millis(), + error = %error, + "relaycast publish failed" + ); + let not_found = format!("Agent \"{}\" not found", normalized_to); + if reply + .send(Err(format!( + 
"{not_found} and Relaycast publish failed: {error}" + ))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast failure response" + ); + } + } + Err(_) => { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + relaycast_timeout_ms = %relaycast_timeout.as_millis(), + relaycast_ms = %relaycast_start.elapsed().as_millis(), + "relaycast publish timed out" + ); + let not_found = format!("Agent \"{}\" not found", normalized_to); + if reply + .send(Err(format!( + "{not_found} and Relaycast publish timed out after {}ms", + relaycast_timeout.as_millis() + ))) + .is_err() + { + tracing::warn!( + target = "relay_broker::http_api", + + event_id = %event_id, + "broker HTTP API reply channel closed before relaycast timeout response" + ); + } + } + } + } + tracing::info!( + target = "relay_broker::http_api", + + event_id = %event_id, + to = %normalized_to, + total_ms = %request_start.elapsed().as_millis(), + "HTTP API send request handling complete" + ); + } + ListenApiRequest::List { reply } => { + let _ = reply.send(Ok(json!({ "agents": workers.list() }))); + } + ListenApiRequest::Threads { reply } => { + let mut messages: Vec = recent_thread_messages.iter().cloned().collect(); + match relaycast_http.get_all_dms(200).await { + Ok(dm_messages) => messages.extend(dm_messages), + Err(error) => { + tracing::debug!( + error = %error, + "failed to fetch relaycast dm history for /api/threads" + ); + } + } + let threads = build_thread_infos(&messages, self_names); + let _ = reply.send(Ok(json!({ "threads": threads }))); + } + ListenApiRequest::SendInput { name, data, reply } => { + if let Err(err) = workers + .send_to_worker( + &name, + "write_pty", + Some(format!("api_{}", Uuid::new_v4().simple())), + json!({ "data": data }), + ) + .await + { + let _ = reply.send(Err(format!("agent_not_found: {}", err))); + } else { + let _ = 
reply.send(Ok(json!({ + "name": name, + "bytes_written": data.len(), + }))); + } + } + ListenApiRequest::ResizePty { + name, + rows, + cols, + reply, + } => { + if rows == 0 || cols == 0 { + let _ = + reply.send(Err("invalid_dimensions: rows and cols must be >= 1".into())); + } else if let Err(err) = workers + .send_to_worker( + &name, + "resize_pty", + Some(format!("api_{}", Uuid::new_v4().simple())), + json!({ "rows": rows, "cols": cols }), + ) + .await + { + let _ = reply.send(Err(format!("agent_not_found: {}", err))); + } else { + let _ = reply.send(Ok(json!({ + "name": name, + "rows": rows, + "cols": cols, + }))); + } + } + ListenApiRequest::WorkerRequest { + name, + kind, + payload, + timeout, + reply, + } => { + // Generic worker request/response: validate the + // worker exists and supports a PTY (all current + // request/response routes target the PTY side), + // then ship the frame and park the `reply` + // oneshot in `pending_requests`. The response is + // fulfilled either by the `*_response` arm below + // or by the deadline sweep in `reap_tick`. + // + // Headless workers don't run a VT and don't handle + // PTY-oriented RPCs — short-circuit with a typed + // error rather than letting the request sit until + // the timeout sweep returns a misleading + // `worker_timeout`. 
+ let runtime = workers + .workers + .get(&name) + .map(|handle| handle.spec.runtime.clone()); + match runtime { + None => { + let _ = + reply.send(Err(worker_request::RequestWorkerError::WorkerNotFound( + format!("no worker named '{name}'"), + ))); + } + Some(AgentRuntime::Headless) => { + let _ = reply.send(Err( + worker_request::RequestWorkerError::UnsupportedRuntime( + format!("worker '{name}' is headless; {kind} is only supported on PTY workers"), + ), + )); + } + Some(AgentRuntime::Pty) => { + let request_id = format!("req_{}", Uuid::new_v4().simple()); + if let Err(err) = workers + .send_to_worker(&name, &kind, Some(request_id.clone()), payload) + .await + { + let _ = reply.send(Err( + worker_request::RequestWorkerError::SendFailed(err.to_string()), + )); + } else { + pending_requests.insert( + request_id, + worker_request::PendingRequest { + kind, + worker_name: name, + reply, + deadline: Instant::now() + timeout, + }, + ); + } + } + } + } + ListenApiRequest::GetMetrics { agent, reply } => { + if let Some(ref agent_name) = agent { + if let Some(handle) = workers.workers.get(agent_name) { + let m = build_agent_metrics(handle); + let _ = reply.send(Ok(json!({ "agents": [m], "broker": workers.metrics.snapshot(workers.workers.len()) }))); + } else { + let _ = reply.send(Err(format!("unknown worker '{}'", agent_name))); + } + } else { + let mut agent_metrics: Vec = + workers.workers.values().map(build_agent_metrics).collect(); + agent_metrics.sort_by(|a, b| a.name.cmp(&b.name)); + let _ = reply.send(Ok(json!({ + "agents": agent_metrics, + "broker": workers.metrics.snapshot(workers.workers.len()), + }))); + } + } + ListenApiRequest::GetStatus { reply } => { + let pending: Vec = pending_deliveries + .values() + .map(|pd| { + json!({ + "delivery_id": pd.delivery.delivery_id, + "worker_name": pd.worker_name, + "event_id": pd.delivery.event_id, + "attempts": pd.attempts, + }) + }) + .collect(); + let _ = reply.send(Ok(json!({ + "agent_count": workers.workers.len(), + 
"agents": workers.list(), + "pending_delivery_count": pending.len(), + "pending_deliveries": pending, + }))); + } + ListenApiRequest::GetCrashInsights { reply } => { + let _ = reply.send(Ok(crash_insights.to_json())); + } + ListenApiRequest::Preflight { agents, reply } => { + let count = agents.len(); + let _ = reply.send(Ok(json!({ "queued": count }))); + // Background preflight — same as stdio handler + for entry in agents { + let http = relaycast_http.clone(); + tokio::spawn(async move { + let _ = tokio::time::timeout( + Duration::from_secs(30), + http.register_agent_token(&entry.name, Some(&entry.cli)), + ) + .await; + }); + } + } + ListenApiRequest::SubscribeChannels { + name, + channels, + reply, + } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + return; + }; + let mut added = Vec::new(); + for ch in &channels { + let exists = handle + .spec + .channels + .iter() + .any(|c| c.eq_ignore_ascii_case(ch)); + if !exists { + handle.spec.channels.push(ch.clone()); + added.push(ch.clone()); + } + } + let all_channels = handle.spec.channels.clone(); + let _ = reply.send(Ok(json!({ + "name": name, + "channels": all_channels, + }))); + } + ListenApiRequest::UnsubscribeChannels { + name, + channels, + reply, + } => { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + return; + }; + handle + .spec + .channels + .retain(|c| !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c))); + let remaining = handle.spec.channels.clone(); + let _ = reply.send(Ok(json!({ + "name": name, + "channels": remaining, + }))); + } + ListenApiRequest::GetInboundDeliveryMode { name, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let mode = delivery_states + .get(&name) + .map(|s| s.mode) + .unwrap_or_default(); + let _ = reply.send(Ok(mode)); + } + } + 
ListenApiRequest::SetInboundDeliveryMode { name, mode, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let entry = delivery_states.entry(name.clone()).or_default(); + let previous = entry.mode; + entry.mode = mode; + let to_flush: Vec = if previous + == InboundDeliveryMode::ManualFlush + && mode == InboundDeliveryMode::AutoInject + { + entry.drain_pending() + } else { + Vec::new() + }; + let flushed = to_flush.len(); + if !to_flush.is_empty() { + tracing::info!( + target = "agent_relay::broker", + worker = %name, + drained = flushed, + "draining pending queue on manual_flush → auto_inject transition" + ); + } + for queued in to_flush { + inject_pending_relay_message( + workers, + pending_deliveries, + &name, + &queued, + delivery_retry_interval, + ) + .await; + } + tracing::info!( + target = "agent_relay::broker", + worker = %name, + previous_mode = previous.as_wire_str(), + mode = mode.as_wire_str(), + flushed, + "inbound delivery mode updated" + ); + if previous != mode { + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"agent_inbound_delivery_mode_changed", + "name":&name, + "previous_mode":previous.as_wire_str(), + "mode":mode.as_wire_str(), + }), + ) + .await; + } + if flushed > 0 { + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"agent_pending_drained", + "name":&name, + "count":flushed, + "reason":"delivery_mode_transition", + }), + ) + .await; + } + let _ = reply.send(Ok(SetInboundDeliveryModeOk { mode, flushed })); + } + } + ListenApiRequest::GetPending { name, reply } => { + if !workers.has_worker(&name) { + let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let snapshot = delivery_states + .get(&name) + .map(|s| s.pending_snapshot()) + .unwrap_or_default(); + let _ = reply.send(Ok(snapshot)); + } + } + ListenApiRequest::FlushPending { name, reply } => { + if !workers.has_worker(&name) { + let _ = 
reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); + } else { + let to_flush: Vec = delivery_states + .get_mut(&name) + .map(|state| state.drain_pending()) + .unwrap_or_default(); + let flushed = to_flush.len(); + if flushed > 0 { + tracing::info!( + target = "agent_relay::broker", + worker = %name, + drained = flushed, + "flushing pending queue on explicit /flush" + ); + } + for queued in to_flush { + inject_pending_relay_message( + workers, + pending_deliveries, + &name, + &queued, + delivery_retry_interval, + ) + .await; + } + if flushed > 0 { + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"agent_pending_drained", + "name":&name, + "count":flushed, + "reason":"explicit_flush", + }), + ) + .await; + } + let _ = reply.send(Ok(flushed)); + } + } + ListenApiRequest::Shutdown { reply } => { + let _ = reply.send(Ok(json!({ "status": "shutting_down" }))); + *shutdown = true; + } + ListenApiRequest::RenewLease { reply } => { + *last_lease_renewal = Instant::now(); + let expires_in = lease_duration.map(|d| d.as_secs()).unwrap_or(0); + let _ = reply.send(Ok(json!({ + "renewed": true, + "expires_in_secs": expires_in, + "persist": persist, + }))); + } + } + } +} diff --git a/crates/broker/src/runtime/event_loop.rs b/crates/broker/src/runtime/event_loop.rs new file mode 100644 index 000000000..b4d766b80 --- /dev/null +++ b/crates/broker/src/runtime/event_loop.rs @@ -0,0 +1,173 @@ +use super::*; + +pub(crate) struct BrokerRuntime { + pub(super) persist: bool, + pub(super) broker_start: Instant, + pub(super) agent_spawn_count: u32, + pub(super) paths: RuntimePaths, + pub(super) state: broker::BrokerState, + pub(super) workspaces: Vec, + pub(super) workspace_lookup: HashMap, + pub(super) default_workspace: RelayWorkspace, + pub(super) default_workspace_id: Option, + pub(super) self_names: HashSet, + pub(super) ws_control_tx: mpsc::Sender, + pub(super) relaycast_http: RelaycastHttpClient, + pub(super) api_rx: mpsc::Receiver, + pub(super) ws_inbound_rx: 
mpsc::Receiver, + pub(super) sdk_out_tx: mpsc::Sender>, + pub(super) worker_event_rx: mpsc::Receiver, + pub(super) workers: WorkerRegistry, + pub(super) crash_insights: relay_broker::crash_insights::CrashInsights, + pub(super) crash_insights_path: PathBuf, + pub(super) sdk_lines: tokio::io::Lines>, + pub(super) stdin_open: bool, + pub(super) reap_tick: tokio::time::Interval, + pub(super) dedup: DedupCache, + pub(super) delivery_retry_interval: Duration, + pub(super) pending_deliveries: HashMap, + pub(super) terminal_failed_deliveries: HashSet, + pub(super) pending_requests: HashMap, + pub(super) delivery_states: HashMap, + pub(super) dm_participants_cache: HashMap)>, + pub(super) recent_thread_messages: VecDeque, + pub(super) shutdown: bool, + pub(super) lease_duration: Option, + pub(super) last_lease_renewal: Instant, + pub(super) lease_check: tokio::time::Interval, + #[cfg(unix)] + pub(super) sigterm: tokio::signal::unix::Signal, + #[cfg(windows)] + pub(super) sigterm: tokio::signal::windows::CtrlShutdown, + pub(super) telemetry: TelemetryClient, +} + +enum RuntimeEvent { + CtrlC, + LeaseTick, + Sigterm, + Api(Box), + ApiClosed, + Stdin(std::io::Result>), + Relaycast(Option), + Worker(Option), + MaintenanceTick, +} + +impl BrokerRuntime { + pub(super) async fn run(mut self) -> Result<()> { + while !self.shutdown { + let event = tokio::select! 
{ + _ = tokio::signal::ctrl_c() => RuntimeEvent::CtrlC, + _ = self.lease_check.tick() => RuntimeEvent::LeaseTick, + _ = self.sigterm.recv() => RuntimeEvent::Sigterm, + request = self.api_rx.recv() => match request { + Some(request) => RuntimeEvent::Api(Box::new(request)), + None => RuntimeEvent::ApiClosed, + }, + result = self.sdk_lines.next_line(), if self.stdin_open => RuntimeEvent::Stdin(result), + message = self.ws_inbound_rx.recv() => RuntimeEvent::Relaycast(message), + event = self.worker_event_rx.recv() => RuntimeEvent::Worker(event), + _ = self.reap_tick.tick() => RuntimeEvent::MaintenanceTick, + }; + + match event { + RuntimeEvent::CtrlC => { + self.shutdown = true; + } + RuntimeEvent::LeaseTick => { + self.handle_lease_tick(); + } + RuntimeEvent::Sigterm => { + tracing::info!("received SIGTERM, shutting down"); + self.shutdown = true; + } + RuntimeEvent::Api(request) => { + self.handle_api_request(*request).await; + } + RuntimeEvent::ApiClosed => {} + RuntimeEvent::Stdin(result) => { + if matches!(result, Ok(None) | Err(_)) { + self.stdin_open = false; + } + } + RuntimeEvent::Relaycast(Some(message)) => { + self.handle_relaycast_message(message).await; + } + RuntimeEvent::Relaycast(None) => {} + RuntimeEvent::Worker(Some(event)) => { + self.handle_worker_event(event).await; + } + RuntimeEvent::Worker(None) => {} + RuntimeEvent::MaintenanceTick => { + self.handle_maintenance_tick().await; + } + } + } + + self.shutdown_runtime().await + } + + fn handle_lease_tick(&mut self) { + if let Some(duration) = self.lease_duration { + if self.last_lease_renewal.elapsed() > duration { + tracing::info!( + elapsed_secs = self.last_lease_renewal.elapsed().as_secs(), + lease_secs = duration.as_secs(), + "owner lease expired — shutting down" + ); + self.shutdown = true; + } + } + } + + async fn shutdown_runtime(mut self) -> Result<()> { + // Save crash insights before shutdown (only in persist mode) + if self.paths.persist { + if let Err(error) = 
self.crash_insights.save(&self.crash_insights_path) { + tracing::warn!(error = %error, "failed to save crash insights"); + } + } + + self.telemetry.track(TelemetryEvent::BrokerStop { + uptime_seconds: self.broker_start.elapsed().as_secs(), + agent_spawn_count: self.agent_spawn_count, + }); + self.telemetry.shutdown(); + + let active_workers: Vec = self.workers.workers.keys().cloned().collect(); + for worker_name in active_workers { + if let Err(error) = self.relaycast_http.mark_agent_offline(&worker_name).await { + tracing::warn!( + worker = %worker_name, + error = %error, + "failed to mark worker offline during shutdown" + ); + } + } + + // Mark broker agent offline in Relaycast before shutting down WS + if let Err(error) = self.relaycast_http.mark_offline().await { + tracing::warn!(error = %error, "failed to mark broker offline during shutdown"); + } + + if let Err(error) = self.ws_control_tx.send(WsControl::Shutdown).await { + tracing::warn!(error = %error, "failed to send ws shutdown signal"); + } + self.pending_deliveries.clear(); + // Clean shutdown — remove pending file since nothing is pending + if self.paths.persist { + let _ = std::fs::remove_file(&self.paths.pending); + } + self.workers.shutdown_all().await?; + + // Clean up state and connection files on graceful shutdown + if self.paths.persist { + let _ = std::fs::remove_file(&self.paths.state); + } + let connection_path = self.paths.state.parent().unwrap().join("connection.json"); + let _ = std::fs::remove_file(&connection_path); + + Ok(()) + } +} diff --git a/crates/broker/src/runtime/init.rs b/crates/broker/src/runtime/init.rs index 0b6f7181c..72100bb5c 100644 --- a/crates/broker/src/runtime/init.rs +++ b/crates/broker/src/runtime/init.rs @@ -3,7 +3,7 @@ use super::*; pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let broker_start = Instant::now(); let startup_debug = startup_debug_enabled(); - let mut agent_spawn_count: u32 = 0; + let agent_spawn_count: 
u32 = 0; telemetry.track(TelemetryEvent::BrokerStart); let runtime_cwd = std::env::current_dir()?; @@ -99,7 +99,7 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re let relay_ready = Arc::new(Notify::new()); let relay_ready_state: Arc>> = Arc::new(RwLock::new(None)); - let (api_tx, mut api_rx) = mpsc::channel::(32); + let (api_tx, api_rx) = mpsc::channel::(32); let bind_addr = format!("{}:{}", cmd.api_bind, cmd.api_port); log_startup_phase( startup_debug, @@ -170,7 +170,7 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re http_base, default_workspace_id, workspaces, - mut ws_inbound_rx, + ws_inbound_rx, } = relay; let workspace_lookup: HashMap = workspaces .iter() @@ -353,29 +353,27 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re } }); - let (worker_event_tx, mut worker_event_rx) = mpsc::channel::(1024); + let (worker_event_tx, worker_event_rx) = mpsc::channel::(1024); let worker_logs_dir = paths .state .parent() .expect("state path should always have a parent") .join("team") .join("worker-logs"); - let mut workers = - WorkerRegistry::new(worker_event_tx, worker_env, worker_logs_dir, broker_start); + let workers = WorkerRegistry::new(worker_event_tx, worker_env, worker_logs_dir, broker_start); // Load crash insights from previous session let crash_insights_path = paths.state.parent().unwrap().join("crash-insights.json"); - let mut crash_insights = - relay_broker::crash_insights::CrashInsights::load(&crash_insights_path); + let crash_insights = relay_broker::crash_insights::CrashInsights::load(&crash_insights_path); - let mut sdk_lines = BufReader::new(tokio::io::stdin()).lines(); - let mut stdin_open = true; + let sdk_lines = BufReader::new(tokio::io::stdin()).lines(); + let stdin_open = true; let mut reap_tick = tokio::time::interval(Duration::from_millis(500)); reap_tick.set_missed_tick_behavior(MissedTickBehavior::Skip); - let mut dedup = 
DedupCache::new(Duration::from_secs(300), 8192); + let dedup = DedupCache::new(Duration::from_secs(300), 8192); let delivery_retry_interval = delivery_retry_interval(); - let mut pending_deliveries = load_pending_deliveries(&paths.pending); - let mut terminal_failed_deliveries: HashSet = HashSet::new(); + let pending_deliveries = load_pending_deliveries(&paths.pending); + let terminal_failed_deliveries: HashSet = HashSet::new(); // Outstanding worker-bound RPC requests waiting on a `*_response` // frame from the wrapped worker. Keyed by the `request_id` we put on // the outbound request frame; the reply `oneshot` is consumed when @@ -385,16 +383,16 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re // The generic correlation infrastructure lives in `crate::worker_request` // so each new request/response route (`snapshot_pty`, `delivery-mode`, // `pending`, `flush`, ...) costs about five lines of broker plumbing. - let mut pending_requests: HashMap = HashMap::new(); + let pending_requests: HashMap = HashMap::new(); // Per-worker inbound-delivery-mode + pending-relay-message queue. Lives // parallel to `workers.workers` so we can swap modes / inspect / // drain without touching `WorkerHandle` (which holds OS-level // process state). See `relay_broker::types::InboundDeliveryState`. Entries // are created lazily on first lookup and removed wherever workers // exit (`Release` arm, `worker_exited` frame, `reap_exited` sweep). 
- let mut delivery_states: HashMap = HashMap::new(); - let mut dm_participants_cache: HashMap)> = HashMap::new(); - let mut recent_thread_messages: VecDeque = VecDeque::new(); + let delivery_states: HashMap = HashMap::new(); + let dm_participants_cache: HashMap)> = HashMap::new(); + let recent_thread_messages: VecDeque = VecDeque::new(); if !pending_deliveries.is_empty() { tracing::info!( count = pending_deliveries.len(), @@ -403,7 +401,7 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re ); } - let mut shutdown = false; + let shutdown = false; // Owner lease: in ephemeral mode, the broker shuts down if the SDK // doesn't renew the lease within this duration. Replaces stdin EOF @@ -413,7 +411,7 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re } else { Some(Duration::from_secs(120)) }; - let mut last_lease_renewal = Instant::now(); + let last_lease_renewal = Instant::now(); let mut lease_check = tokio::time::interval(Duration::from_secs(10)); lease_check.set_missed_tick_behavior(MissedTickBehavior::Skip); @@ -421,2789 +419,48 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re // `tokio::signal::ctrl_c()` is handled in its own select! arm below and // works on both platforms. #[cfg(unix)] - let mut sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?; + let sigterm = tokio::signal::unix::signal(tokio::signal::unix::SignalKind::terminate())?; #[cfg(windows)] let mut sigterm = tokio::signal::windows::ctrl_shutdown()?; - while !shutdown { - tokio::select! 
{ - _ = tokio::signal::ctrl_c() => { - shutdown = true; - } - - _ = lease_check.tick() => { - if let Some(duration) = lease_duration { - if last_lease_renewal.elapsed() > duration { - tracing::info!( - elapsed_secs = last_lease_renewal.elapsed().as_secs(), - lease_secs = duration.as_secs(), - "owner lease expired — shutting down" - ); - shutdown = true; - } - } - } - - _ = sigterm.recv() => { - tracing::info!("received SIGTERM, shutting down"); - shutdown = true; - } - - // HTTP API requests (when --api-port is active) - result = api_rx.recv() => { - if let Some(req) = result { - match req { - ListenApiRequest::Spawn { - name, - cli, - transport, - model, - args, - task, - channels, - cwd, - team, - shadow_of, - shadow_mode, - continue_from, - idle_threshold_secs, - skip_relay_prompt, - restart_policy, - agent_token, - reply, - } => { - let effective_channels = if channels.is_empty() { - default_spawn_channels() - } else { - channels.clone() - }; - let spec = match build_http_api_spawn_spec( - name.clone(), - cli.clone(), - transport, - model.clone(), - args, - effective_channels.clone(), - cwd, - team, - shadow_of, - shadow_mode, - *restart_policy, - ) { - Ok(spec) => spec, - Err(error) => { - let _ = reply.send(Err(error.to_string())); - continue; - } - }; - let mut preregistration_warning: Option = None; - let registration_result = retry_agent_registration( - &relaycast_http, &name, Some(&cli), - ).await; - let worker_relay_key = match registration_result { - Ok(token) => Some(token), - Err(RegRetryOutcome::RetryableExhausted(error)) => { - let message = format_worker_preregistration_error(&name, &error); - tracing::warn!( - worker = %name, - error = %error, - "continuing spawn without pre-registration after retries exhausted" - ); - preregistration_warning = Some(message); - None - } - Err(RegRetryOutcome::Fatal(error)) => { - let _ = reply.send(Err(format_worker_preregistration_error(&name, &error))); - continue; - } - }; - - // Caller-supplied agent_token 
overrides auto-registration - let worker_relay_key = agent_token.or(worker_relay_key); - - let mut effective_task = normalize_initial_task(task); - if let Some(ref continue_from) = continue_from { - let continuity_dir = continuity_dir(&paths.state); - let continuity_file = continuity_dir.join(format!("{}.json", continue_from)); - if continuity_file.exists() { - match std::fs::read_to_string(&continuity_file) { - Ok(contents) => { - if let Ok(ctx) = serde_json::from_str::(&contents) { - let prev_task = ctx - .get("initial_task") - .and_then(Value::as_str) - .unwrap_or("unknown"); - let summary = ctx - .get("summary") - .and_then(Value::as_str) - .unwrap_or("no summary available"); - let messages = ctx - .get("message_history") - .and_then(Value::as_array) - .map(|msgs| { - msgs.iter() - .filter_map(|m| { - let from = m - .get("from") - .and_then(Value::as_str) - .unwrap_or("?"); - let text = m - .get("text") - .and_then(Value::as_str) - .unwrap_or(""); - if text.is_empty() { - None - } else { - Some(format!(" {}: {}", from, text)) - } - }) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - - let continuity_block = format!( - "## Continuity Context (from previous session as '{}')\n\ - Previous task: {}\n\ - Session summary: {}\n{}", - continue_from, - prev_task, - summary, - if messages.is_empty() { - String::new() - } else { - format!("Recent messages:\n{}\n", messages) - } - ); - - effective_task = Some(match effective_task { - Some(new_task) => { - format!( - "{}\n\n## Current Task\n{}", - continuity_block, new_task - ) - } - None => continuity_block, - }); - tracing::info!( - agent = %name, - continue_from = %continue_from, - "injected continuity context from previous session for HTTP API spawn" - ); - } - } - Err(e) => { - tracing::warn!( - agent = %name, - continue_from = %continue_from, - error = %e, - "failed to read continuity file for HTTP API spawn" - ); - } - } - } else { - tracing::warn!( - agent = %name, - continue_from = %continue_from, - "no 
continuity file found at {}", - continuity_file.display() - ); - } - } - - match workers.spawn( - spec, - Some("Dashboard".to_string()), - None, - worker_relay_key.clone(), - skip_relay_prompt, - idle_threshold_secs.map(|s| s.to_string()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::HumanDashboard, - has_task: effective_task.is_some(), - is_shadow: effective_spec.shadow_of.is_some() - || effective_spec.shadow_mode.is_some(), - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent { - runtime: effective_spec.runtime.clone(), - parent: Some("Dashboard".to_string()), - channels: effective_spec.channels.clone(), - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - note_local_spawn_control_dedup( - &mut dedup, - default_workspace_id - .as_deref() - .or_else(|| workspaces.first().map(|workspace| workspace.workspace_id.as_str())), - &name, - worker_relay_key.as_deref(), - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_spawned", - "name":&name, - "runtime":runtime_label(&effective_spec.runtime), - "provider": effective_spec.provider.clone(), - "cli": effective_spec.cli.clone(), - "model": effective_spec.model.clone(), - "pid":pid, - "source":"http_api", - "pre_registered": worker_relay_key.is_some(), - "registration_warning": preregistration_warning.clone(), - }), - ).await; - publish_agent_state_transition( - 
&ws_control_tx, - &name, - "spawned", - Some("http_api_spawn"), - ) - .await; - let _ = reply.send(Ok(json!({ - "success": true, - "name": name, - "runtime": runtime_label(&effective_spec.runtime), - "model": effective_spec.model.clone(), - "pid": pid, - "pre_registered": worker_relay_key.is_some(), - "warning": preregistration_warning, - }))); - } - Err(e) => { - eprintln!("[agent-relay] HTTP API: failed to spawn '{}': {}", name, e); - let _ = reply.send(Err(e.to_string())); - } - } - } - ListenApiRequest::SetModel { name, model, timeout_ms, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - - let model_command = format!("/model {}\n", model); - let result = async { - handle - .stdin - .write_all(model_command.as_bytes()) - .await - .with_context(|| { - format!("failed writing model command to worker '{}'", name) - })?; - handle - .stdin - .flush() - .await - .with_context(|| { - format!("failed flushing worker '{}' stdin", name) - })?; - if let Some(timeout_ms) = timeout_ms { - tracing::info!( - name = %name, - timeout_ms, - "HTTP API set_model timeout_ms is currently advisory only" - ); - } - Ok::<(), anyhow::Error>(()) - } - .await; - - match result { - Ok(()) => { - let _ = reply.send(Ok(json!({ - "name": name, - "model": model, - "success": true, - }))); - } - Err(error) => { - let _ = reply.send(Err(error.to_string())); - } - } - } - ListenApiRequest::Release { name, reason, reply } => { - if let Some(ref r) = reason { - tracing::info!(worker = %name, reason = %r, "releasing agent via HTTP API"); - } - // Unregister from supervisor before release to prevent - // auto-restart of intentionally released agents. 
- workers.supervisor.unregister(&name); - workers.metrics.on_release(&name); - match workers.release(&name).await { - Ok(()) => { - if let Err(error) = relaycast_http.mark_agent_offline(&name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark released worker offline in relaycast" - ); - } - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({"kind":"delivery_dropped","name":&name,"count":dropped,"reason":"agent_released"}), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "agent_released"); - delivery_states.remove(&name); - state.agents.remove(&name); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_released","name":&name}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "exited", - Some("http_api_release"), - ) - .await; - let _ = reply.send(Ok(json!({ "success": true, "name": name }))); - } - Err(e) => { - let message = e.to_string(); - if is_unknown_worker_error_message(&message) { - relaycast_http.forget_agent_registration(&name); - state.agents.remove(&name); - if paths.persist { - let _ = state.save(&paths.state); - } - tracing::debug!( - worker = %name, - "ignoring duplicate HTTP API release for already exited worker" - ); - let _ = reply.send(Ok(json!({ "success": true, "name": name }))); - } else { - eprintln!("[agent-relay] HTTP API: failed to release '{}': {}", name, e); - let _ = reply.send(Err(message)); - } - } - } - } - ListenApiRequest::Send { - to, - text, - from, - thread_id, - workspace_id, - workspace_alias, - mode, - reply, - } => { - let normalized_to = to.trim().to_string(); - let selected_workspace = if let Some(workspace_id) = workspace_id.as_deref() { - workspace_lookup - .get(workspace_id) - .cloned() - .ok_or_else(|| format!("workspace_not_found:workspace '{}' is not attached", workspace_id)) - } else 
if let Some(workspace_alias) = workspace_alias.as_deref() { - workspaces - .iter() - .find(|workspace| { - workspace - .workspace_alias - .as_deref() - .is_some_and(|alias| alias.eq_ignore_ascii_case(workspace_alias)) - }) - .cloned() - .ok_or_else(|| format!("workspace_not_found:workspace alias '{}' is not attached", workspace_alias)) - } else if workspaces.len() == 1 { - Ok(workspaces[0].clone()) - } else if let Some(default_workspace_id) = default_workspace_id.as_deref() { - workspace_lookup - .get(default_workspace_id) - .cloned() - .ok_or_else(|| format!("workspace_not_found: default workspace '{}' not found", default_workspace_id)) - } else { - Err("ambiguous_workspace:workspaceId or workspaceAlias is required when multiple workspaces are attached".to_string()) - }; - let selected_workspace = match selected_workspace { - Ok(workspace) => workspace, - Err(error) => { - let _ = reply.send(Err(error)); - continue; - } - }; - let selected_workspace_id = selected_workspace.workspace_id.clone(); - let selected_workspace_alias = selected_workspace.workspace_alias.clone(); - let workspace_self_name = selected_workspace.self_name.clone(); - let normalized_sender = normalize_sender(from.clone()); - let from_dashboard = - sender_is_dashboard_label(&normalized_sender, &workspace_self_name); - let delivery_from = if from_dashboard { - workspace_self_name.clone() - } else { - normalized_sender.clone() - }; - tracing::info!( - target = "relay_broker::http_api", - - raw_from = ?from, - normalized_sender = %normalized_sender, - from_dashboard = %from_dashboard, - delivery_from = %delivery_from, - to = %normalized_to, - thread_id = ?thread_id, - self_name = %workspace_self_name, - "HTTP API send request" - ); - let ui_from = if from_dashboard { - workspace_self_name.clone() - } else { - normalized_sender - }; - let event_id = format!("http_{}", Uuid::new_v4().simple()); - let priority = if normalized_to.starts_with('#') { 3 } else { 2 }; - let mut delivered = 0usize; - let mut 
delivery_errors = 0usize; - let request_start = Instant::now(); - let local_delivery_timeout = http_api_local_delivery_timeout(); - let relaycast_timeout = http_api_relaycast_send_timeout(); - let event_emit_timeout = http_api_event_emit_timeout(); - - record_thread_history_event( - &mut recent_thread_messages, - json!({ - "event_id": event_id.clone(), - "from": ui_from.clone(), - "target": normalized_to.clone(), - "to": normalized_to.clone(), - "text": text.clone(), - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - "timestamp": chrono::Utc::now().to_rfc3339(), - }), - ); - - let targets = if normalized_to.starts_with('#') { - workers.worker_names_for_channel_delivery(&normalized_to, &delivery_from, Some(&selected_workspace_id)) - } else { - workers.worker_names_for_direct_target(&normalized_to, &delivery_from, Some(&selected_workspace_id)) - }; - - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - delivery_from = %delivery_from, - target_count = %targets.len(), - "resolved HTTP API send targets" - ); - - for worker_name in targets { - // Inbound-delivery queue: every inbound message - // enters the per-worker FIFO first. `auto_inject` - // drains immediately; `manual_flush` holds and - // counts as delivered so the HTTP caller's ack - // semantics are unchanged. We pass the FULL - // routing context so any drain reproduces the - // original delivery (channel/thread/workspace - // /priority/mode), not a stripped-down DM. 
- match queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - &worker_name, - InboundContext { - from: &delivery_from, - body: &text, - target: &normalized_to, - thread_id: thread_id.as_deref(), - workspace_id: Some(selected_workspace_id.as_str()), - workspace_alias: selected_workspace_alias.as_deref(), - priority, - mode: mode.clone(), - event_id: Some(&event_id), - }, - ) { - InboundQueueOutcome::Queued => { - delivered = delivered.saturating_add(1); - tracing::info!( - target = "relay_broker::http_api", - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - "queued local delivery (manual_flush inbound delivery mode)" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_queued", - "name":&worker_name, - "event_id":&event_id, - "from":&delivery_from, - "target":&normalized_to, - "reason":"inbound_delivery_manual_flush", - }), - ).await; - continue; - } - InboundQueueOutcome::DrainNow(to_drain) => { - for queued in to_drain { - let queued_event_id = - queued.event_id.as_deref().unwrap_or(""); - let is_current = - queued.event_id.as_deref() == Some(event_id.as_str()); - match timeout( - local_delivery_timeout, - try_inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &worker_name, - &queued, - delivery_retry_interval, - ), - ) - .await - { - Ok(Ok(_)) => { - if is_current { - delivered = delivered.saturating_add(1); - } - } - Ok(Err(error)) => { - if is_current { - delivery_errors = - delivery_errors.saturating_add(1); - } - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %queued_event_id, - to = %queued.target, - worker = %worker_name, - error = %error, - "local delivery attempt failed" - ); - } - Err(_) => { - if is_current { - delivery_errors = - delivery_errors.saturating_add(1); - } - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %queued_event_id, - to = %queued.target, - worker = %worker_name, - timeout_ms = 
%local_delivery_timeout.as_millis(), - "local delivery attempt timed out" - ); - } - } - } - continue; - } - InboundQueueOutcome::WorkerMissing => { - // Fall through so the standard - // not-found accounting path runs. - } - } - match timeout( - local_delivery_timeout, - queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &worker_name, - &event_id, - &delivery_from, - &normalized_to, - &text, - thread_id.clone(), - Some(selected_workspace_id.clone()), - selected_workspace_alias.clone(), - priority, - mode.clone(), - delivery_retry_interval, - ), - ) - .await - { - Ok(Ok(_)) => { - delivered = delivered.saturating_add(1); - } - Ok(Err(error)) => { - delivery_errors = delivery_errors.saturating_add(1); - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - error = %error, - "local delivery attempt failed" - ); - } - Err(_) => { - delivery_errors = delivery_errors.saturating_add(1); - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - worker = %worker_name, - timeout_ms = %local_delivery_timeout.as_millis(), - "local delivery attempt timed out" - ); - } - } - } - - if delivered > 0 { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - delivery_from = %delivery_from, - ui_from = %ui_from, - delivered = %delivered, - "local delivery succeeded" - ); - emit_http_api_event_with_timeout( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": event_id, - "from": ui_from, - "target": normalized_to, - "body": text, - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - }), - event_emit_timeout, - ) - .await; - if reply - .send(Ok(json!({ - "success": true, - "event_id": event_id, - "delivered": delivered, - "local": true, - "workspace_id": selected_workspace_id, - 
"workspace_alias": selected_workspace_alias, - }))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before local delivery response" - ); - } - } else { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - mode = ?mode, - delivery_errors = %delivery_errors, - delivery_from = %delivery_from, - ui_from = %ui_from, - relaycast_timeout_ms = %relaycast_timeout.as_millis(), - "no local deliveries succeeded; forwarding to relaycast" - ); - let relaycast_start = Instant::now(); - match timeout( - relaycast_timeout, - selected_workspace - .http_client - .send_with_mode(&normalized_to, &text, mode.clone()), - ) - .await - { - Ok(Ok(())) => { - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_ms = %relaycast_start.elapsed().as_millis(), - "relaycast publish succeeded" - ); - emit_http_api_event_with_timeout( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": event_id, - "from": ui_from, - "target": normalized_to, - "body": text, - "thread_id": thread_id.clone(), - "workspace_id": selected_workspace_id.clone(), - "workspace_alias": selected_workspace_alias.clone(), - }), - event_emit_timeout, - ) - .await; - if reply - .send(Ok(json!({ - "success": true, - "event_id": event_id, - "relaycast_published": true, - "local": false, - "workspace_id": selected_workspace_id, - "workspace_alias": selected_workspace_alias, - }))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast response" - ); - } - } - Ok(Err(error)) => { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_ms = %relaycast_start.elapsed().as_millis(), - error = %error, - "relaycast publish failed" - ); - let not_found = format!("Agent 
\"{}\" not found", normalized_to); - if reply - .send(Err(format!( - "{not_found} and Relaycast publish failed: {error}" - ))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast failure response" - ); - } - } - Err(_) => { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - relaycast_timeout_ms = %relaycast_timeout.as_millis(), - relaycast_ms = %relaycast_start.elapsed().as_millis(), - "relaycast publish timed out" - ); - let not_found = format!("Agent \"{}\" not found", normalized_to); - if reply - .send(Err(format!( - "{not_found} and Relaycast publish timed out after {}ms", - relaycast_timeout.as_millis() - ))) - .is_err() - { - tracing::warn!( - target = "relay_broker::http_api", - - event_id = %event_id, - "broker HTTP API reply channel closed before relaycast timeout response" - ); - } - } - } - } - tracing::info!( - target = "relay_broker::http_api", - - event_id = %event_id, - to = %normalized_to, - total_ms = %request_start.elapsed().as_millis(), - "HTTP API send request handling complete" - ); - } - ListenApiRequest::List { reply } => { - let _ = reply.send(Ok(json!({ "agents": workers.list() }))); - } - ListenApiRequest::Threads { reply } => { - let mut messages: Vec = - recent_thread_messages.iter().cloned().collect(); - match relaycast_http.get_all_dms(200).await { - Ok(dm_messages) => messages.extend(dm_messages), - Err(error) => { - tracing::debug!( - error = %error, - "failed to fetch relaycast dm history for /api/threads" - ); - } - } - let threads = build_thread_infos(&messages, &self_names); - let _ = reply.send(Ok(json!({ "threads": threads }))); - } - ListenApiRequest::SendInput { name, data, reply } => { - if let Err(err) = workers.send_to_worker( - &name, "write_pty", Some(format!("api_{}", Uuid::new_v4().simple())), - json!({ "data": data }), - ).await { - let _ = 
reply.send(Err(format!("agent_not_found: {}", err))); - } else { - let _ = reply.send(Ok(json!({ - "name": name, - "bytes_written": data.len(), - }))); - } - } - ListenApiRequest::ResizePty { name, rows, cols, reply } => { - if rows == 0 || cols == 0 { - let _ = reply.send(Err("invalid_dimensions: rows and cols must be >= 1".into())); - } else if let Err(err) = workers.send_to_worker( - &name, "resize_pty", Some(format!("api_{}", Uuid::new_v4().simple())), - json!({ "rows": rows, "cols": cols }), - ).await { - let _ = reply.send(Err(format!("agent_not_found: {}", err))); - } else { - let _ = reply.send(Ok(json!({ - "name": name, - "rows": rows, - "cols": cols, - }))); - } - } - ListenApiRequest::WorkerRequest { name, kind, payload, timeout, reply } => { - // Generic worker request/response: validate the - // worker exists and supports a PTY (all current - // request/response routes target the PTY side), - // then ship the frame and park the `reply` - // oneshot in `pending_requests`. The response is - // fulfilled either by the `*_response` arm below - // or by the deadline sweep in `reap_tick`. - // - // Headless workers don't run a VT and don't handle - // PTY-oriented RPCs — short-circuit with a typed - // error rather than letting the request sit until - // the timeout sweep returns a misleading - // `worker_timeout`. 
- let runtime = workers - .workers - .get(&name) - .map(|handle| handle.spec.runtime.clone()); - match runtime { - None => { - let _ = reply.send(Err( - worker_request::RequestWorkerError::WorkerNotFound( - format!("no worker named '{name}'"), - ), - )); - } - Some(AgentRuntime::Headless) => { - let _ = reply.send(Err( - worker_request::RequestWorkerError::UnsupportedRuntime( - format!("worker '{name}' is headless; {kind} is only supported on PTY workers"), - ), - )); - } - Some(AgentRuntime::Pty) => { - let request_id = format!("req_{}", Uuid::new_v4().simple()); - if let Err(err) = workers.send_to_worker( - &name, - &kind, - Some(request_id.clone()), - payload, - ).await { - let _ = reply.send(Err( - worker_request::RequestWorkerError::SendFailed( - err.to_string(), - ), - )); - } else { - pending_requests.insert( - request_id, - worker_request::PendingRequest { - kind, - worker_name: name, - reply, - deadline: Instant::now() + timeout, - }, - ); - } - } - } - } - ListenApiRequest::GetMetrics { agent, reply } => { - if let Some(ref agent_name) = agent { - if let Some(handle) = workers.workers.get(agent_name) { - let m = build_agent_metrics(handle); - let _ = reply.send(Ok(json!({ "agents": [m], "broker": workers.metrics.snapshot(workers.workers.len()) }))); - } else { - let _ = reply.send(Err(format!("unknown worker '{}'", agent_name))); - } - } else { - let mut agent_metrics: Vec = workers.workers.values() - .map(build_agent_metrics) - .collect(); - agent_metrics.sort_by(|a, b| a.name.cmp(&b.name)); - let _ = reply.send(Ok(json!({ - "agents": agent_metrics, - "broker": workers.metrics.snapshot(workers.workers.len()), - }))); - } - } - ListenApiRequest::GetStatus { reply } => { - let pending: Vec = pending_deliveries.values().map(|pd| { - json!({ - "delivery_id": pd.delivery.delivery_id, - "worker_name": pd.worker_name, - "event_id": pd.delivery.event_id, - "attempts": pd.attempts, - }) - }).collect(); - let _ = reply.send(Ok(json!({ - "agent_count": 
workers.workers.len(), - "agents": workers.list(), - "pending_delivery_count": pending.len(), - "pending_deliveries": pending, - }))); - } - ListenApiRequest::GetCrashInsights { reply } => { - let _ = reply.send(Ok(crash_insights.to_json())); - } - ListenApiRequest::Preflight { agents, reply } => { - let count = agents.len(); - let _ = reply.send(Ok(json!({ "queued": count }))); - // Background preflight — same as stdio handler - for entry in agents { - let http = relaycast_http.clone(); - tokio::spawn(async move { - let _ = tokio::time::timeout( - Duration::from_secs(30), - http.register_agent_token(&entry.name, Some(&entry.cli)), - ).await; - }); - } - } - ListenApiRequest::SubscribeChannels { name, channels, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - let mut added = Vec::new(); - for ch in &channels { - let exists = handle.spec.channels.iter() - .any(|c| c.eq_ignore_ascii_case(ch)); - if !exists { - handle.spec.channels.push(ch.clone()); - added.push(ch.clone()); - } - } - let all_channels = handle.spec.channels.clone(); - let _ = reply.send(Ok(json!({ - "name": name, - "channels": all_channels, - }))); - } - ListenApiRequest::UnsubscribeChannels { name, channels, reply } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - continue; - }; - handle.spec.channels.retain(|c| { - !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c)) - }); - let remaining = handle.spec.channels.clone(); - let _ = reply.send(Ok(json!({ - "name": name, - "channels": remaining, - }))); - } - ListenApiRequest::GetInboundDeliveryMode { name, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let mode = delivery_states - .get(&name) - .map(|s| s.mode) - .unwrap_or_default(); - let _ = reply.send(Ok(mode)); - } - } - 
ListenApiRequest::SetInboundDeliveryMode { name, mode, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let entry = delivery_states.entry(name.clone()).or_default(); - let previous = entry.mode; - entry.mode = mode; - let to_flush: Vec = if previous - == InboundDeliveryMode::ManualFlush - && mode == InboundDeliveryMode::AutoInject - { - entry.drain_pending() - } else { - Vec::new() - }; - let flushed = to_flush.len(); - if !to_flush.is_empty() { - tracing::info!( - target = "agent_relay::broker", - worker = %name, - drained = flushed, - "draining pending queue on manual_flush → auto_inject transition" - ); - } - for queued in to_flush { - inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &name, - &queued, - delivery_retry_interval, - ) - .await; - } - tracing::info!( - target = "agent_relay::broker", - worker = %name, - previous_mode = previous.as_wire_str(), - mode = mode.as_wire_str(), - flushed, - "inbound delivery mode updated" - ); - if previous != mode { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_inbound_delivery_mode_changed", - "name":&name, - "previous_mode":previous.as_wire_str(), - "mode":mode.as_wire_str(), - }), - ).await; - } - if flushed > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_pending_drained", - "name":&name, - "count":flushed, - "reason":"delivery_mode_transition", - }), - ).await; - } - let _ = reply.send(Ok(SetInboundDeliveryModeOk { mode, flushed })); - } - } - ListenApiRequest::GetPending { name, reply } => { - if !workers.has_worker(&name) { - let _ = reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let snapshot = delivery_states - .get(&name) - .map(|s| s.pending_snapshot()) - .unwrap_or_default(); - let _ = reply.send(Ok(snapshot)); - } - } - ListenApiRequest::FlushPending { name, reply } => { - if !workers.has_worker(&name) { - let _ = 
reply.send(Err(DeliveryRouteError::WorkerNotFound(name))); - } else { - let to_flush: Vec = delivery_states - .get_mut(&name) - .map(|state| state.drain_pending()) - .unwrap_or_default(); - let flushed = to_flush.len(); - if flushed > 0 { - tracing::info!( - target = "agent_relay::broker", - worker = %name, - drained = flushed, - "flushing pending queue on explicit /flush" - ); - } - for queued in to_flush { - inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &name, - &queued, - delivery_retry_interval, - ) - .await; - } - if flushed > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"agent_pending_drained", - "name":&name, - "count":flushed, - "reason":"explicit_flush", - }), - ).await; - } - let _ = reply.send(Ok(flushed)); - } - } - ListenApiRequest::Shutdown { reply } => { - let _ = reply.send(Ok(json!({ "status": "shutting_down" }))); - shutdown = true; - } - ListenApiRequest::RenewLease { reply } => { - last_lease_renewal = Instant::now(); - let expires_in = lease_duration.map(|d| d.as_secs()).unwrap_or(0); - let _ = reply.send(Ok(json!({ - "renewed": true, - "expires_in_secs": expires_in, - "persist": cmd.persist, - }))); - } - } - } - } - - // Stdin is no longer used for SDK communication — all control - // goes through the HTTP/WS API. We drain stdin to avoid - // blocking if anything writes to it, and stop polling after EOF. 
- result = sdk_lines.next_line(), if stdin_open => { - if matches!(result, Ok(None) | Err(_)) { - stdin_open = false; - } - } - - ws_msg = ws_inbound_rx.recv() => { - if let Some(ws_msg) = ws_msg { - let workspace_id = ws_msg.workspace_id.clone(); - let workspace_alias = ws_msg.workspace_alias.clone(); - let ws_value = ws_msg.value; - let workspace_state = workspace_lookup - .get(&workspace_id) - .cloned() - .unwrap_or_else(|| default_workspace.clone()); - let workspace_self_name = workspace_state.self_name.clone(); - let workspace_self_names = workspace_state.self_names.clone(); - let workspace_self_agent_ids = workspace_state.self_agent_ids.clone(); - let workspace_http = workspace_state.http_client.clone(); - let ws_type = ws_value - .get("type") - .and_then(Value::as_str) - .unwrap_or(""); - tracing::info!( - target = "agent_relay::broker", - ws_type = %ws_type, - workspace_id = %workspace_id, - event = %ws_value, - "received relaycast ws event" - ); - - let control_dedup_key = if matches!( - ws_type, - "agent.spawn_requested" | "agent.release_requested" - ) { - relaycast_ws_control_dedup_key(&workspace_id, ws_type, &ws_value) - } else { - None - }; - - if let Some(ref control_dedup_key) = control_dedup_key { - if !dedup.insert_if_new(control_dedup_key, Instant::now()) { - tracing::info!( - ws_type = %ws_type, - workspace_id = %workspace_id, - "dropping duplicate relaycast control event" - ); - continue; - } - } - - if matches!(ws_type, "agent.spawn_requested" | "agent.release_requested") { - if let Err(ref deser_err) = serde_json::from_value::(ws_value.clone()) { - eprintln!( - "[agent-relay] WARNING: failed to deserialize {} event: {}", - ws_type, deser_err - ); - } - } - if let Ok(ws_event) = serde_json::from_value::(ws_value.clone()) { - match ws_event { - WsEvent::AgentReleaseRequested(event) => { - let name = event.agent.name; - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - 
workspace_http.forget_agent_registration(&name); - tracing::debug!( - worker = %name, - "ignoring relaycast release request for broker self" - ); - continue; - } - workers.supervisor.unregister(&name); - workers.metrics.on_release(&name); - match workers.release(&name).await { - Ok(()) => { - workspace_http.forget_agent_registration(&name); - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({"kind":"delivery_dropped","name":name,"count":dropped,"reason":"agent_released"}), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "relaycast_release"); - delivery_states.remove(&name); - telemetry.track(TelemetryEvent::AgentRelease { - cli: String::new(), - release_reason: "relaycast_release".to_string(), - lifetime_seconds: 0, - release_source: ActionSource::Protocol, - }); - state.agents.remove(&name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_released","name":name}), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "exited", - Some("relaycast_release"), - ) - .await; - tracing::info!(child = %name, "released worker via relaycast in broker mode"); - eprintln!("[agent-relay] released worker '{}' via relaycast", name); - } - Err(error) => { - let message = error.to_string(); - if is_unknown_worker_error_message(&message) { - workspace_http.forget_agent_registration(&name); - state.agents.remove(&name); - if paths.persist { - if let Err(save_error) = state.save(&paths.state) { - tracing::warn!( - path = %paths.state.display(), - error = %save_error, - "failed to persist broker state" - ); - } - } - tracing::debug!( - child = %name, - "ignoring duplicate relaycast release for already exited worker" - ); - } else { - 
tracing::error!(child = %name, error = %error, "failed to release worker via relaycast"); - eprintln!("[agent-relay] failed to release '{}': {}", name, error); - } - } - } - continue; - } - WsEvent::AgentSpawnRequested(event) => { - let name = event.agent.name; - eprintln!("[agent-relay] received spawn request for '{}' (cli: {})", name, event.agent.cli); - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - tracing::debug!( - worker = %name, - "ignoring relaycast spawn request for broker self" - ); - eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); - continue; - } - let local_spawn_echo_key = - relaycast_spawn_control_dedup_key(&workspace_id, &name); - if relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key.as_deref(), - &local_spawn_echo_key, - ) && !dedup.insert_if_new(&local_spawn_echo_key, Instant::now()) - { - tracing::info!( - worker = %name, - workspace_id = %workspace_id, - "dropping duplicate/local relaycast spawn request" - ); - eprintln!("[agent-relay] dropping duplicate spawn request for '{}'", name); - continue; - } - let cli = event.agent.cli; - let task = Some(event.agent.task).filter(|value| !value.trim().is_empty()); - let channel = event.agent.channel; - - tracing::info!(name = %name, cli = %cli, task = ?task, channel = ?channel, "handling spawn request from relaycast WS"); - let channels = channel - .as_deref() - .map(|ch| { - let mut chs = default_spawn_channels(); - if !chs.contains(&ch.to_string()) { - chs.push(ch.to_string()); - } - chs - }) - .unwrap_or_else(default_spawn_channels); - let spec = AgentSpec { - name: name.clone(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some(cli.clone()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: vec![], - channels: channels.clone(), - restart_policy: None, - }; - let effective_task = normalize_initial_task(task.clone()); - - // Pre-register agent 
token. Claude doesn't need this — it - // bakes the API key into --mcp-config JSON and self-registers. - // Non-Claude CLIs need the token injected into their CLI args - // at spawn time, so we do a quick (3s) registration attempt. - let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); - let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); - let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); - let worker_relay_key = { - let ws_token = relaycast_ws_spawn_token(&ws_value); - if ws_token.is_some() { - ws_token - } else if is_claude { - // Claude self-registers via its MCP server — skip blocking call - None - } else { - const REG_TIMEOUT: Duration = Duration::from_secs(3); - match tokio::time::timeout( - REG_TIMEOUT, - workspace_http.register_agent_token(&name, Some(cli.as_str())), - ).await { - Ok(Ok(token)) => { - tracing::info!( - worker = %name, - "pre-registered agent via broker for WS spawn" - ); - Some(token) - } - Ok(Err(error)) => { - tracing::warn!( - worker = %name, - error = %error, - "WS spawn pre-registration failed; agent will self-register" - ); - None - } - Err(_) => { - tracing::warn!( - worker = %name, - "WS spawn pre-registration timed out (3s); agent will self-register" - ); - None - } - } - } - }; - - match workers.spawn( - spec, - Some("Relaycast".to_string()), - None, - worker_relay_key.clone(), - false, - Some(workspace_id.clone()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::Protocol, - has_task: effective_task.is_some(), - is_shadow: false, - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent 
{ - runtime: AgentRuntime::Pty, - parent: Some("Relaycast".to_string()), - channels, - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_spawned", - "name": name, - "runtime": "pty", - "cli": cli, - "model": effective_spec.model.clone(), - "pid": pid, - "source": "relaycast_ws", - "pre_registered": worker_relay_key.is_some(), - }), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "spawned", - Some("relaycast_spawn"), - ) - .await; - tracing::info!(child = %name, pid, "spawned worker via relaycast WS"); - eprintln!("[agent-relay] spawned worker '{}' via relaycast", name); - } - Err(e) => { - let msg = e.to_string(); - if msg.contains("already exists") { - tracing::debug!(child = %name, "agent already spawned via SDK, skipping duplicate relaycast WS spawn"); - } else { - tracing::error!(child = %name, error = %e, "failed to spawn worker via relaycast WS"); - eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); - } - } - } - continue; - } - _ => {} - } - } else if ws_type == "agent.spawn_requested" { - // Fallback: the SDK failed to deserialize the event (e.g. missing - // fields like `already_existed` or `task: null`). Extract the - // spawn info directly from the raw JSON so we don't silently - // drop the request. 
- let agent_obj = ws_value.get("agent"); - let name = agent_obj - .and_then(|a| a.get("name")) - .and_then(Value::as_str) - .unwrap_or("") - .to_string(); - let cli = agent_obj - .and_then(|a| a.get("cli")) - .and_then(Value::as_str) - .unwrap_or("claude") - .to_string(); - let task = agent_obj - .and_then(|a| a.get("task")) - .and_then(Value::as_str) - .unwrap_or("") - .to_string(); - let channel = agent_obj - .and_then(|a| a.get("channel")) - .and_then(Value::as_str) - .map(String::from); - - if !name.is_empty() { - eprintln!("[agent-relay] handling spawn request for '{}' via JSON fallback (cli: {})", name, cli); - - if is_relaycast_self_control_target( - &name, - &workspace_self_name, - &workspace_self_names, - ) { - eprintln!("[agent-relay] ignoring spawn request for '{}' (broker self)", name); - } else { - let local_spawn_echo_key = - relaycast_spawn_control_dedup_key(&workspace_id, &name); - let should_dedup = relaycast_ws_should_apply_local_spawn_echo_dedup( - control_dedup_key.as_deref(), - &local_spawn_echo_key, - ); - // Always insert the local echo key for consistency with the primary path - let is_new = dedup.insert_if_new(&local_spawn_echo_key, Instant::now()); - if !should_dedup || is_new - { - let channels = channel - .as_deref() - .map(|ch| { - let mut chs = default_spawn_channels(); - if !chs.contains(&ch.to_string()) { - chs.push(ch.to_string()); - } - chs - }) - .unwrap_or_else(default_spawn_channels); - let spec = AgentSpec { - name: name.clone(), - runtime: AgentRuntime::Pty, - provider: None, - cli: Some(cli.clone()), - model: None, - cwd: None, - team: None, - shadow_of: None, - shadow_mode: None, - args: vec![], - channels: channels.clone(), - restart_policy: None, - }; - let task_opt = Some(task).filter(|v| !v.trim().is_empty()); - let effective_task = normalize_initial_task(task_opt.clone()); - - // Pre-register (same logic as primary WS spawn path). 
- let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); - let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); - let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); - let worker_relay_key = { - let ws_token = relaycast_ws_spawn_token(&ws_value); - if ws_token.is_some() { - ws_token - } else if is_claude { - None - } else { - const REG_TIMEOUT: Duration = Duration::from_secs(3); - match tokio::time::timeout( - REG_TIMEOUT, - workspace_http.register_agent_token(&name, Some(cli.as_str())), - ).await { - Ok(Ok(token)) => Some(token), - Ok(Err(error)) => { - tracing::warn!( - worker = %name, - error = %error, - "WS spawn fallback pre-registration failed" - ); - None - } - Err(_) => { - tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out (3s)"); - None - } - } - } - }; - - match workers.spawn( - spec, - Some("Relaycast".to_string()), - None, - worker_relay_key.clone(), - false, - Some(workspace_id.clone()), - ).await { - Ok(effective_spec) => { - if let Some(ref task_text) = effective_task { - workers.initial_tasks.insert(name.clone(), task_text.clone()); - } - agent_spawn_count += 1; - telemetry.track(TelemetryEvent::AgentSpawn { - cli: cli.clone(), - runtime: runtime_label(&effective_spec.runtime).to_string(), - spawn_source: ActionSource::Protocol, - has_task: effective_task.is_some(), - is_shadow: false, - }); - let pid = workers.worker_pid(&name).unwrap_or(0); - state.agents.insert( - name.clone(), - broker::PersistedAgent { - runtime: AgentRuntime::Pty, - parent: Some("Relaycast".to_string()), - channels, - pid: workers.worker_pid(&name), - started_at: Some( - std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - .unwrap_or_default() - .as_secs(), - ), - spec: Some(effective_spec.clone()), - restart_policy: None, - initial_task: effective_task, - - }, - ); - if paths.persist { let _ = state.save(&paths.state); } - let _ = send_event( 
- &sdk_out_tx, - json!({ - "kind": "agent_spawned", - "name": name, - "runtime": "pty", - "cli": cli, - "model": effective_spec.model.clone(), - "pid": pid, - "source": "relaycast_ws_fallback", - "pre_registered": worker_relay_key.is_some(), - }), - ).await; - publish_agent_state_transition( - &workspace_state.ws_control_tx, - &name, - "spawned", - Some("relaycast_spawn"), - ) - .await; - eprintln!("[agent-relay] spawned worker '{}' via relaycast (JSON fallback)", name); - } - Err(e) => { - let msg = e.to_string(); - if !msg.contains("already exists") { - eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); - } - } - } - } else { - eprintln!("[agent-relay] dropping duplicate spawn request for '{}' (fallback)", name); - } - } - } - // Don't fall through to map_ws_event for control events - // handled by the JSON fallback path. - continue; - } - - // Preserve the raw channel from the WS event for thread replies. - // The mapper may set target = "thread" (synthetic) when the SDK - // struct lacks a channel field; we use the raw value to fix - // display_target so the dashboard can route the message correctly. 
- let raw_ws_channel = ws_value - .get("channel") - .and_then(Value::as_str) - .map(String::from); - - if let Some(mapped) = map_ws_event(&ws_value, &workspace_id, workspace_alias.as_deref()) { - tracing::info!( - from = %mapped.from, - target = %mapped.target, - kind = ?mapped.kind, - event_id = %mapped.event_id, - text_len = mapped.text.len(), - "mapped inbound WS event" - ); - let dedup_key = format!("{}:{}", mapped.workspace_id, mapped.event_id); - if !dedup.insert_if_new(&dedup_key, Instant::now()) { - tracing::info!(event_id = %mapped.event_id, workspace_id = %mapped.workspace_id, "dropping duplicate event"); - continue; - } - let has_local_target = if mapped.target.starts_with('#') { - !workers - .worker_names_for_channel_delivery(&mapped.target, &mapped.from, Some(&workspace_id)) - .is_empty() - } else if matches!(mapped.kind, InboundKind::ThreadReply) && mapped.target == "thread" { - // Thread replies target "thread" (synthetic), not a specific worker. - // Treat as having a local target when any worker exists so the - // self-echo filter doesn't drop dashboard-originated thread replies. - workers.has_any_worker() - } else { - workers.has_worker_by_name_ignoring_case(&mapped.target) - }; - if routing::is_self_echo( - &mapped, - &workspace_self_names, - &workspace_self_agent_ids, - has_local_target, - ) { - tracing::info!(from = %mapped.from, sender_agent_id = ?mapped.sender_agent_id, self_names = ?workspace_self_names, "skipping self-echo in broker loop"); - continue; - } - - telemetry.track(TelemetryEvent::MessageSend { - is_broadcast: mapped.target.starts_with('#'), - has_thread: mapped.thread_id.is_some(), - }); - - let mut delivery_plan = { - let worker_view = workers.routing_workers(); - routing::resolve_delivery_targets(&mapped, &worker_view) - }; - - // For thread replies with synthetic target "thread", override - // display_target with the actual channel so the dashboard can - // route the message to the correct channel/DM view. 
- if matches!(mapped.kind, InboundKind::ThreadReply) - && delivery_plan.display_target == "thread" - { - if let Some(ref ch) = raw_ws_channel { - let chan_target = if ch.starts_with('#') { - ch.clone() - } else { - format!("#{ch}") - }; - tracing::info!( - original_target = "thread", - resolved_target = %chan_target, - "overriding thread reply display_target with raw WS channel" - ); - delivery_plan.display_target = chan_target; - } - } - - if mapped.target.starts_with('#') { - tracing::info!( - channel = %mapped.target, - from = %mapped.from, - target_count = delivery_plan.targets.len(), - targets = ?delivery_plan.targets, - "channel delivery targets" - ); - } else { - tracing::info!( - target = %mapped.target, - from = %mapped.from, - kind = ?mapped.kind, - direct_targets = ?delivery_plan.targets, - "direct message routing" - ); - } - - if delivery_plan.needs_dm_resolution { - let conversation_id = mapped.target.clone(); - tracing::info!(conversation_id = %conversation_id, "resolving DM participants"); - let participants = resolve_dm_participants_cached( - &workspace_http, - &mut dm_participants_cache, - &workspace_id, - &conversation_id, - ) - .await; - tracing::info!(participants = ?participants, "resolved DM participants"); - - if let Some(participant) = participants - .iter() - .find(|participant| !agent_name_eq(participant, &mapped.from)) - { - delivery_plan.display_target = participant.clone(); - } - - let worker_view = workers.routing_workers(); - delivery_plan.targets = routing::worker_names_for_dm_participants( - &worker_view, - &participants, - &mapped.from, - Some(&workspace_id), - ); - tracing::info!(dm_targets = ?delivery_plan.targets, "DM participant-based routing targets"); - } - - for worker_name in delivery_plan.targets { - // Inbound-delivery queue: mirrors the /api/send - // queue above. Auto-inject workers drain the queue - // immediately; manual-flush workers leave relaycast - // messages parked until flush. 
The same full-context - // capture makes drains reproduce the original - // delivery (channel/thread/workspace). - match queue_inbound_for_delivery_mode( - &mut delivery_states, - &workers, - &worker_name, - InboundContext { - from: &mapped.from, - body: &mapped.text, - target: &mapped.target, - thread_id: mapped.thread_id.as_deref(), - workspace_id: Some(mapped.workspace_id.as_str()), - workspace_alias: mapped.workspace_alias.as_deref(), - priority: mapped.priority.as_u8(), - mode: MessageInjectionMode::Wait, - event_id: Some(&mapped.event_id), - }, - ) { - InboundQueueOutcome::Queued => { - tracing::info!( - target = "agent_relay::broker", - event_id = %mapped.event_id, - worker = %worker_name, - "queued inbound relay message (manual_flush inbound delivery mode)" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_queued", - "name":&worker_name, - "event_id":&mapped.event_id, - "from":&mapped.from, - "target":&mapped.target, - "reason":"inbound_delivery_manual_flush", - }), - ).await; - continue; - } - InboundQueueOutcome::DrainNow(to_drain) => { - for queued in to_drain { - if let Err(error) = try_inject_pending_relay_message( - &mut workers, - &mut pending_deliveries, - &worker_name, - &queued, - delivery_retry_interval, - ) - .await - { - let _ = send_error( - &sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"worker": worker_name})), - ) - .await; - } - } - continue; - } - InboundQueueOutcome::WorkerMissing => {} - } - if let Err(error) = queue_and_try_delivery( - &mut workers, - &mut pending_deliveries, - &worker_name, - &mapped, - delivery_retry_interval, - ).await { - let _ = send_error(&sdk_out_tx, None, "delivery_failed", error.to_string(), true, Some(json!({"worker": worker_name}))).await; - } - } - - let display_target = - display_target_for_dashboard(&delivery_plan.display_target, &workspace_self_names, &workspace_self_name); - let display_from = if is_self_name(&workspace_self_names, &mapped.from) 
- { - workspace_self_name.clone() - } else { - mapped.from.clone() - }; - tracing::info!( - from = %display_from, - display_target = %display_target, - event_id = %mapped.event_id, - body_len = mapped.text.len(), - "broadcasting relay_inbound to dashboard" - ); - record_thread_history_event( - &mut recent_thread_messages, - json!({ - "event_id": mapped.event_id.clone(), - "from": display_from.clone(), - "target": display_target.clone(), - "text": mapped.text.clone(), - "thread_id": mapped.thread_id.clone(), - "workspace_id": mapped.workspace_id.clone(), - "workspace_alias": mapped.workspace_alias.clone(), - "timestamp": chrono::Utc::now().to_rfc3339(), - }), - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "relay_inbound", - "event_id": mapped.event_id, - "from": display_from, - "target": display_target, - "body": mapped.text, - "thread_id": mapped.thread_id, - "workspace_id": mapped.workspace_id, - "workspace_alias": mapped.workspace_alias, - }), - ).await; - } else if ws_type != "broker.connection" && ws_type != "broker.channel_join" { - tracing::info!( - target = "agent_relay::broker", - ws_type = %ws_type, - event = %ws_value, - "relaycast ws event ignored by inbound mapper" - ); - } - } - } - - worker_event = worker_event_rx.recv() => { - if let Some(worker_event) = worker_event { - match worker_event { - WorkerEvent::Message { name, value } => { - if let Some(msg_type) = value.get("type").and_then(Value::as_str) { - if msg_type == "delivery_ack" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload - .get("delivery_id") - .and_then(Value::as_str) - .unwrap_or(""); - - // Terminal guard: ignore late delivery_ack events once a - // delivery has reached terminal failed status. 
- if !delivery_id.is_empty() - && terminal_failed_deliveries.contains(delivery_id) - { - tracing::info!( - worker = %name, - delivery_id = %delivery_id, - "ignoring late delivery_ack after terminal failed status" - ); - continue; - } - - if let Ok(ack) = serde_json::from_value::(payload.clone()) { - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - &ack.delivery_id, - Some(&ack.event_id), - &name, - "delivery_ack", - ); - terminal_failed_deliveries.remove(&ack.delivery_id); - } - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_ack", - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_queued" { - if let Some(payload) = value.get("payload") { - let _ = send_event(&sdk_out_tx, json!({ - "kind": msg_type, - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_injected" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload - .get("delivery_id") - .and_then(Value::as_str) - .unwrap_or(""); - let event_id = - payload.get("event_id").and_then(Value::as_str); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - event_id, - &name, - "delivery_injected", - ); - let _ = send_event(&sdk_out_tx, json!({ - "kind": msg_type, - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "timestamp": payload.get("timestamp"), - })).await; - } - } else if msg_type == "delivery_verified" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); - let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); - tracing::debug!( - target = "agent_relay::broker", - worker = %name, - delivery_id = 
%delivery_id, - event_id = %event_id, - "delivery verified by echo detection" - ); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - Some(event_id), - &name, - "delivery_verified", - ); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_verified", - "name": name, - "delivery_id": delivery_id, - "event_id": event_id, - })).await; - } - } else if msg_type == "delivery_active" { - if let Some(payload) = value.get("payload") { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_active", - "name": name, - "delivery_id": payload.get("delivery_id"), - "event_id": payload.get("event_id"), - "pattern": payload.get("pattern"), - })).await; - } - } else if msg_type == "delivery_failed" { - if let Some(payload) = value.get("payload") { - let delivery_id = payload.get("delivery_id").and_then(Value::as_str).unwrap_or(""); - let event_id = payload.get("event_id").and_then(Value::as_str).unwrap_or(""); - let reason = payload.get("reason").and_then(Value::as_str).unwrap_or("unknown"); - tracing::warn!( - target = "agent_relay::broker", - worker = %name, - delivery_id = %delivery_id, - event_id = %event_id, - reason = %reason, - "delivery failed — echo not detected" - ); - clear_pending_delivery_if_event_matches( - &mut pending_deliveries, - delivery_id, - Some(event_id), - &name, - "delivery_failed", - ); - if !delivery_id.is_empty() { - terminal_failed_deliveries - .insert(delivery_id.to_string()); - } - let _ = send_event(&sdk_out_tx, json!({ - "kind": "delivery_failed", - "name": name, - "delivery_id": delivery_id, - "event_id": event_id, - "reason": reason, - })).await; - } - } else if msg_type == "worker_error" { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_error", - "name": name, - "error": value.get("payload").cloned().unwrap_or(Value::Null) - })).await; - } else if msg_type.ends_with("_response") { - // Generic worker request/response dispatch. 
- // Any frame whose `type` ends in - // `_response` is routed by `request_id` - // into the matching parked `oneshot` in - // `pending_requests`. The pending entry - // owns the format/error decoding logic - // via `worker_request::fulfil_response_frame`. - let routed = worker_request::fulfil_response_frame( - &mut pending_requests, - &value, - ); - if !routed { - let req_id = value - .get("request_id") - .and_then(Value::as_str) - .unwrap_or(""); - tracing::debug!( - target = "agent_relay::broker", - worker = %name, - msg_type = %msg_type, - request_id = %req_id, - "worker response with no pending caller — dropping" - ); - } - } else if msg_type == "worker_stream" { - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_stream", - "name": name, - "stream": value.get("payload").and_then(|p| p.get("stream")).cloned().unwrap_or(Value::String("stdout".to_string())), - "chunk": value.get("payload").and_then(|p| p.get("chunk")).cloned().unwrap_or(Value::String(String::new())), - })).await; - } else if msg_type == "worker_ready" { - if let Some(task_text) = workers.initial_tasks.remove(&name) { - let event_id = format!("init_{}", Uuid::new_v4().simple()); - if let Err(e) = queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &name, - &event_id, - "broker", - &name, - &task_text, - None, - None, - None, - 2, - MessageInjectionMode::Wait, - delivery_retry_interval, - ).await { - tracing::warn!(worker = %name, error = %e, "failed to deliver initial_task"); - } - } - let runtime = value.get("payload") - .and_then(|p| p.get("runtime")) - .and_then(Value::as_str) - .unwrap_or("pty"); - let (provider_val, cli_val, model_val) = workers.workers.get(&name) - .map(|h| (h.spec.provider.clone(), h.spec.cli.clone(), h.spec.model.clone())) - .unwrap_or((None, None, None)); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "worker_ready", - "name": name, - "runtime": runtime, - "provider": provider_val, - "cli": cli_val, - "model": model_val, - })).await; - } 
else if msg_type == "agent_idle" { - let idle_secs = value.get("payload") - .and_then(|p| p.get("idle_secs")) - .and_then(Value::as_u64) - .unwrap_or(0); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "agent_idle", - "name": name, - "idle_secs": idle_secs, - })).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "idle", - Some("idle_threshold"), - ) - .await; - } else if msg_type == "agent_exit" { - let reason = value.get("payload") - .and_then(|p| p.get("reason")) - .and_then(Value::as_str) - .unwrap_or("unknown"); - tracing::info!(agent = %name, reason = %reason, "agent requested exit"); - let _ = send_event(&sdk_out_tx, json!({ - "kind": "agent_exit", - "name": name, - "reason": reason, - })).await; - } else if msg_type == "continuity_command" { - // Agent-initiated continuity: the pty_worker detected a - // KIND: continuity block in PTY output and emitted this event. - let action = value.get("payload") - .and_then(|p| p.get("action")) - .and_then(Value::as_str) - .unwrap_or(""); - let content = value.get("payload") - .and_then(|p| p.get("content")) - .and_then(Value::as_str) - .unwrap_or(""); - match action { - "save" => { - let cont_dir = continuity_dir(&paths.state); - if let Err(e) = std::fs::create_dir_all(&cont_dir) { - tracing::warn!( - agent = %name, - error = %e, - "continuity_command save: failed to create dir" - ); - } else { - // Build a minimal continuity record with the provided summary. 
- let agent_data = state.agents.get(&name); - let cli = agent_data - .and_then(|d| d.spec.as_ref()) - .and_then(|s| s.cli.clone()); - let initial_task = agent_data - .and_then(|d| d.initial_task.clone()); - let continuity = json!({ - "agent_name": name, - "cli": cli, - "initial_task": initial_task, - "released_at": null, - "lifetime_seconds": null, - "message_history": [], - "summary": content, - }); - let cont_file = cont_dir.join(format!("{}.json", name)); - match std::fs::write( - &cont_file, - serde_json::to_string_pretty(&continuity) - .unwrap_or_default(), - ) { - Ok(()) => tracing::info!( - agent = %name, - path = %cont_file.display(), - "continuity_command: saved agent-initiated continuity" - ), - Err(e) => tracing::warn!( - agent = %name, - error = %e, - "continuity_command save: failed to write file" - ), - } - } - } - "load" => { - let cont_dir = continuity_dir(&paths.state); - let cont_file = cont_dir.join(format!("{}.json", name)); - if cont_file.exists() { - match std::fs::read_to_string(&cont_file) { - Ok(raw) => { - if let Ok(ctx) = serde_json::from_str::(&raw) { - // Build a context summary and inject it - let prev_task = ctx.get("initial_task") - .and_then(Value::as_str) - .unwrap_or("unknown"); - let summary = ctx.get("summary") - .and_then(Value::as_str) - .unwrap_or("no summary"); - let history_str = ctx.get("message_history") - .and_then(Value::as_array) - .map(|msgs| { - msgs.iter() - .filter_map(|m| { - let from = m.get("from")?.as_str()?; - let text = m.get("text") - .or_else(|| m.get("body"))? 
- .as_str()?; - Some(format!(" - {}: {}", from, text)) - }) - .collect::>() - .join("\n") - }) - .unwrap_or_default(); - let history_section = if history_str.is_empty() { - String::new() - } else { - format!("\nRecent messages:\n{}", history_str) - }; - let inject_body = format!( - "## Continuity Context (from previous session as '{}')\n\ - Previous task: {}\n\ - Session summary: {}{}", - name, prev_task, summary, history_section - ); - let event_id = format!("cont_load_{}", Uuid::new_v4().simple()); - if let Err(e) = queue_and_try_delivery_raw( - &mut workers, - &mut pending_deliveries, - &name, - &event_id, - "broker", - &name, - &inject_body, - None, - None, - None, - 2, - MessageInjectionMode::Wait, - delivery_retry_interval, - ).await { - tracing::warn!( - agent = %name, - error = %e, - "continuity_command load: failed to inject context" - ); - } else { - tracing::info!( - agent = %name, - "continuity_command: injected loaded context" - ); - } - } - } - Err(e) => tracing::warn!( - agent = %name, - error = %e, - "continuity_command load: failed to read file" - ), - } - } else { - tracing::debug!( - agent = %name, - "continuity_command load: no continuity file found" - ); - } - } - "uncertain" => { - tracing::info!( - agent = %name, - content = %content, - "continuity_command: agent reported uncertainty" - ); - } - other => { - tracing::warn!( - agent = %name, - action = %other, - "continuity_command: unknown action ignored" - ); - } - } - } else if msg_type == "worker_exited" { - // PTY worker process is exiting — clean up and - // emit agent_exited so the SDK doesn't have to - // wait for the reap_exited polling cycle. 
- let code = value.get("payload") - .and_then(|p| p.get("code")) - .and_then(Value::as_i64) - .map(|c| c as i32); - let signal = value.get("payload") - .and_then(|p| p.get("signal")) - .and_then(Value::as_str) - .map(String::from); - tracing::info!( - agent = %name, - code = ?code, - signal = ?signal, - "worker_exited received — cleaning up" - ); - // Remove from registry so reap_exited won't - // double-process this worker. - workers.workers.remove(&name); - workers.initial_tasks.remove(&name); - // Drop pending deliveries for this worker - let dropped = drop_pending_for_worker(&mut pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "delivery_dropped", - "name": name, - "count": dropped, - "reason": "worker_exited", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, &name, "worker_exited"); - delivery_states.remove(&name); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_exited", - "name": name, - "code": code, - "signal": signal, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "exited", - Some("worker_exited"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(&name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark exited worker offline in relaycast" - ); - } - state.agents.remove(&name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!( - path = %paths.state.display(), - error = %error, - "failed to persist broker state" - ); - } - } - } - } - } - } - } - } - - _ = reap_tick.tick() => { - let now = Instant::now(); - - // Time out worker request/response calls whose worker never - // responded. Common cause: worker crashed between us sending - // the request frame and it parsing the frame. Without this - // sweep the HTTP handler would hang forever on its oneshot. 
- for (req_id, worker_name, kind) in - worker_request::reap_expired(&mut pending_requests, now) - { - tracing::warn!( - target = "agent_relay::broker", - request_id = %req_id, - worker = %worker_name, - kind = %kind, - "worker request timed out before worker responded" - ); - } - - let due_ids: Vec = pending_deliveries - .iter() - .filter_map(|(delivery_id, pending)| { - if pending.next_retry_at <= now { - Some(delivery_id.clone()) - } else { - None - } - }) - .collect(); - - for delivery_id in due_ids { - let was_retry = pending_deliveries - .get(&delivery_id) - .map(|pending| pending.attempts > 0) - .unwrap_or(false); - - match retry_pending_delivery( - &delivery_id, - &mut workers, - &mut pending_deliveries, - delivery_retry_interval, - ) - .await { - Ok(Some((worker_name, attempts, event_id))) => { - if was_retry { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_retry", - "name": worker_name, - "delivery_id": delivery_id, - "event_id": event_id, - "attempts": attempts, - }), - ).await; - } - } - Ok(None) => { - if was_retry { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "delivery_dropped", - "delivery_id": delivery_id, - "reason": "max_retries_exceeded", - }), - ).await; - } - } - Err(error) => { - let _ = send_error( - &sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"delivery_id": delivery_id})), - ).await; - } - } - } - - let exited = match workers.reap_exited().await { - Ok(v) => v, - Err(e) => { - tracing::warn!(err = %e, "reap_exited failed, skipping this cycle"); - vec![] - } - }; - for (name, code, signal) in &exited { - // Record crash in insights - let (category, description) = relay_broker::crash_insights::CrashInsights::analyze(*code, signal.as_deref()); - crash_insights.record(relay_broker::crash_insights::CrashRecord { - agent_name: name.clone(), - exit_code: *code, - signal: signal.clone(), - timestamp: std::time::SystemTime::now() - .duration_since(std::time::UNIX_EPOCH) - 
.unwrap_or_default() - .as_secs(), - uptime_secs: 0, - category, - description, - }); - - telemetry.track(TelemetryEvent::AgentCrash { - cli: String::new(), - exit_code: *code, - lifetime_seconds: 0, - }); - - // Check supervisor for restart decision - use relay_broker::supervisor::RestartDecision; - match workers.supervisor.on_exit(name, *code, signal.as_deref()) { - Some(RestartDecision::Restart { delay }) => { - // Keep pending deliveries — we'll redeliver after restart - workers.metrics.on_crash(name); - let restart_count = workers.supervisor.restart_count(name) + 1; - tracing::info!( - name = %name, - exit_code = ?code, - signal = ?signal, - restart_count, - delay_ms = delay.as_millis() as u64, - "agent will be restarted" - ); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_restarting", - "name": name, - "code": code, - "signal": signal, - "restart_count": restart_count, - "delay_ms": delay.as_millis() as u64, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "stuck", - Some("restarting"), - ) - .await; - } - Some(RestartDecision::PermanentlyDead { reason }) => { - workers.metrics.on_permanent_death(name); - let dropped = drop_pending_for_worker(&mut pending_deliveries, name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_dropped", - "name": name, - "count": dropped, - "reason":"worker_permanently_dead", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, name, "worker_permanently_dead"); - delivery_states.remove(name); - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_permanently_dead","name":name,"reason":reason}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "stuck", - Some("permanently_dead"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark permanently dead worker offline in relaycast" - ); - } - 
state.agents.remove(name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - } - None => { - // Not supervised — original behavior - let dropped = drop_pending_for_worker(&mut pending_deliveries, name); - if dropped > 0 { - let _ = send_event( - &sdk_out_tx, - json!({ - "kind":"delivery_dropped", - "name": name, - "count": dropped, - "reason":"worker_exited", - }), - ).await; - } - fail_pending_requests_for_worker(&mut pending_requests, name, "worker_exited"); - delivery_states.remove(name); - let _ = send_event( - &sdk_out_tx, - json!({"kind":"agent_exited","name":name,"code":code,"signal":signal}), - ).await; - publish_agent_state_transition( - &ws_control_tx, - name, - "exited", - Some("worker_exited"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark exited worker offline in relaycast" - ); - } - state.agents.remove(name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); - } - } - } - } - } - - // Check for agents ready to restart (past cooldown) - if !shutdown { - let pending_restarts = workers.supervisor.pending_restarts(); - for (name, rst) in pending_restarts { - if let Some(remaining) = relaycast_http.registration_block_remaining(&name) - { - tracing::debug!( - worker = %name, - retry_after_secs = remaining.as_secs().max(1), - "skipping restart while relaycast registration is rate-limited" - ); - continue; - } - - let worker_relay_key = if rst.skip_relay_prompt { - None - } else { - match relaycast_http - .register_agent_token(&name, rst.spec.cli.as_deref()) - .await - { - Ok(token) => Some(token), - Err(error) => { - match registration_retry_after_secs(&error) { - Some(retry_after_secs) => { - 
tracing::warn!( - worker = %name, - retry_after_secs, - error = %error, - "restart blocked by relaycast registration rate limit" - ); - } - None => { - tracing::error!( - worker = %name, - error = %error, - "failed to pre-register worker before restart" - ); - } - } - continue; - } - } - }; - - match workers - .spawn( - rst.spec.clone(), - rst.parent.clone(), - None, - worker_relay_key, - rst.skip_relay_prompt, - None, - ) - .await - { - Ok(_) => { - workers.supervisor.on_restarted(&name); - workers.metrics.on_restart(&name); - if let Some(task) = rst.initial_task { - workers.initial_tasks.insert(name.clone(), task); - } - tracing::info!(name = %name, restart_count = rst.restart_count, "agent restarted"); - let _ = send_event( - &sdk_out_tx, - json!({ - "kind": "agent_restarted", - "name": name, - "restart_count": rst.restart_count, - }), - ).await; - publish_agent_state_transition( - &ws_control_tx, - &name, - "spawned", - Some("restarted"), - ) - .await; - } - Err(e) => { - tracing::error!(name = %name, error = %e, "restart failed"); - } - } - } - } - - // Persist pending deliveries for crash recovery - if paths.persist { - if let Err(error) = save_pending_deliveries(&paths.pending, &pending_deliveries) { - tracing::warn!(path = %paths.pending.display(), error = %error, "failed to persist pending deliveries"); - } - } - } - } - } - - // Save crash insights before shutdown (only in persist mode) - if paths.persist { - if let Err(error) = crash_insights.save(&crash_insights_path) { - tracing::warn!(error = %error, "failed to save crash insights"); - } - } - - telemetry.track(TelemetryEvent::BrokerStop { - uptime_seconds: broker_start.elapsed().as_secs(), + let runtime = BrokerRuntime { + persist: cmd.persist, + broker_start, agent_spawn_count, - }); - telemetry.shutdown(); - - let active_workers: Vec = workers.workers.keys().cloned().collect(); - for worker_name in active_workers { - if let Err(error) = relaycast_http.mark_agent_offline(&worker_name).await { - 
tracing::warn!( - worker = %worker_name, - error = %error, - "failed to mark worker offline during shutdown" - ); - } - } - - // Mark broker agent offline in Relaycast before shutting down WS - if let Err(error) = relaycast_http.mark_offline().await { - tracing::warn!(error = %error, "failed to mark broker offline during shutdown"); - } - - if let Err(error) = ws_control_tx.send(WsControl::Shutdown).await { - tracing::warn!(error = %error, "failed to send ws shutdown signal"); - } - pending_deliveries.clear(); - // Clean shutdown — remove pending file since nothing is pending - if paths.persist { - let _ = std::fs::remove_file(&paths.pending); - } - workers.shutdown_all().await?; - - // Clean up state and connection files on graceful shutdown - if paths.persist { - let _ = std::fs::remove_file(&paths.state); - } - let connection_path = paths.state.parent().unwrap().join("connection.json"); - let _ = std::fs::remove_file(&connection_path); + paths, + state, + workspaces, + workspace_lookup, + default_workspace, + default_workspace_id, + self_names, + ws_control_tx, + relaycast_http, + api_rx, + ws_inbound_rx, + sdk_out_tx, + worker_event_rx, + workers, + crash_insights, + crash_insights_path, + sdk_lines, + stdin_open, + reap_tick, + dedup, + delivery_retry_interval, + pending_deliveries, + terminal_failed_deliveries, + pending_requests, + delivery_states, + dm_participants_cache, + recent_thread_messages, + shutdown, + lease_duration, + last_lease_renewal, + lease_check, + sigterm, + telemetry, + }; - Ok(()) + runtime.run().await } diff --git a/crates/broker/src/runtime/maintenance.rs b/crates/broker/src/runtime/maintenance.rs new file mode 100644 index 000000000..508d4db46 --- /dev/null +++ b/crates/broker/src/runtime/maintenance.rs @@ -0,0 +1,353 @@ +use super::*; + +impl BrokerRuntime { + pub(super) async fn handle_maintenance_tick(&mut self) { + let paths = &self.paths; + let state = &mut self.state; + let sdk_out_tx = &self.sdk_out_tx; + let ws_control_tx = 
&self.ws_control_tx; + let relaycast_http = &self.relaycast_http; + let workers = &mut self.workers; + let telemetry = &self.telemetry; + let crash_insights = &mut self.crash_insights; + let pending_deliveries = &mut self.pending_deliveries; + let pending_requests = &mut self.pending_requests; + let delivery_states = &mut self.delivery_states; + let delivery_retry_interval = self.delivery_retry_interval; + let shutdown = &self.shutdown; + + let now = Instant::now(); + + // Time out worker request/response calls whose worker never + // responded. Common cause: worker crashed between us sending + // the request frame and it parsing the frame. Without this + // sweep the HTTP handler would hang forever on its oneshot. + for (req_id, worker_name, kind) in worker_request::reap_expired(pending_requests, now) { + tracing::warn!( + target = "agent_relay::broker", + request_id = %req_id, + worker = %worker_name, + kind = %kind, + "worker request timed out before worker responded" + ); + } + + let due_ids: Vec = pending_deliveries + .iter() + .filter_map(|(delivery_id, pending)| { + if pending.next_retry_at <= now { + Some(delivery_id.clone()) + } else { + None + } + }) + .collect(); + + for delivery_id in due_ids { + let was_retry = pending_deliveries + .get(&delivery_id) + .map(|pending| pending.attempts > 0) + .unwrap_or(false); + + match retry_pending_delivery( + &delivery_id, + workers, + pending_deliveries, + delivery_retry_interval, + ) + .await + { + Ok(Some((worker_name, attempts, event_id))) => { + if was_retry { + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"delivery_retry", + "name": worker_name, + "delivery_id": delivery_id, + "event_id": event_id, + "attempts": attempts, + }), + ) + .await; + } + } + Ok(None) => { + if was_retry { + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_dropped", + "delivery_id": delivery_id, + "reason": "max_retries_exceeded", + }), + ) + .await; + } + } + Err(error) => { + let _ = send_error( + sdk_out_tx, + 
None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"delivery_id": delivery_id})), + ) + .await; + } + } + } + + let exited = match workers.reap_exited().await { + Ok(v) => v, + Err(e) => { + tracing::warn!(err = %e, "reap_exited failed, skipping this cycle"); + vec![] + } + }; + for (name, code, signal) in &exited { + // Record crash in insights + let (category, description) = + relay_broker::crash_insights::CrashInsights::analyze(*code, signal.as_deref()); + crash_insights.record(relay_broker::crash_insights::CrashRecord { + agent_name: name.clone(), + exit_code: *code, + signal: signal.clone(), + timestamp: std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + uptime_secs: 0, + category, + description, + }); + + telemetry.track(TelemetryEvent::AgentCrash { + cli: String::new(), + exit_code: *code, + lifetime_seconds: 0, + }); + + // Check supervisor for restart decision + use relay_broker::supervisor::RestartDecision; + match workers.supervisor.on_exit(name, *code, signal.as_deref()) { + Some(RestartDecision::Restart { delay }) => { + // Keep pending deliveries — we'll redeliver after restart + workers.metrics.on_crash(name); + let restart_count = workers.supervisor.restart_count(name) + 1; + tracing::info!( + name = %name, + exit_code = ?code, + signal = ?signal, + restart_count, + delay_ms = delay.as_millis() as u64, + "agent will be restarted" + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_restarting", + "name": name, + "code": code, + "signal": signal, + "restart_count": restart_count, + "delay_ms": delay.as_millis() as u64, + }), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + name, + "stuck", + Some("restarting"), + ) + .await; + } + Some(RestartDecision::PermanentlyDead { reason }) => { + workers.metrics.on_permanent_death(name); + let dropped = drop_pending_for_worker(pending_deliveries, name); + if dropped > 0 { + let _ = send_event( + 
sdk_out_tx, + json!({ + "kind":"delivery_dropped", + "name": name, + "count": dropped, + "reason":"worker_permanently_dead", + }), + ) + .await; + } + fail_pending_requests_for_worker( + pending_requests, + name, + "worker_permanently_dead", + ); + delivery_states.remove(name); + let _ = send_event( + sdk_out_tx, + json!({"kind":"agent_permanently_dead","name":name,"reason":reason}), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + name, + "stuck", + Some("permanently_dead"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark permanently dead worker offline in relaycast" + ); + } + state.agents.remove(name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + } + None => { + // Not supervised — original behavior + let dropped = drop_pending_for_worker(pending_deliveries, name); + if dropped > 0 { + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"delivery_dropped", + "name": name, + "count": dropped, + "reason":"worker_exited", + }), + ) + .await; + } + fail_pending_requests_for_worker(pending_requests, name, "worker_exited"); + delivery_states.remove(name); + let _ = send_event( + sdk_out_tx, + json!({"kind":"agent_exited","name":name,"code":code,"signal":signal}), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + name, + "exited", + Some("worker_exited"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark exited worker offline in relaycast" + ); + } + state.agents.remove(name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + } + } + } + + // Check for agents ready 
to restart (past cooldown) + if !*shutdown { + let pending_restarts = workers.supervisor.pending_restarts(); + for (name, rst) in pending_restarts { + if let Some(remaining) = relaycast_http.registration_block_remaining(&name) { + tracing::debug!( + worker = %name, + retry_after_secs = remaining.as_secs().max(1), + "skipping restart while relaycast registration is rate-limited" + ); + continue; + } + + let worker_relay_key = if rst.skip_relay_prompt { + None + } else { + match relaycast_http + .register_agent_token(&name, rst.spec.cli.as_deref()) + .await + { + Ok(token) => Some(token), + Err(error) => { + match registration_retry_after_secs(&error) { + Some(retry_after_secs) => { + tracing::warn!( + worker = %name, + retry_after_secs, + error = %error, + "restart blocked by relaycast registration rate limit" + ); + } + None => { + tracing::error!( + worker = %name, + error = %error, + "failed to pre-register worker before restart" + ); + } + } + continue; + } + } + }; + + match workers + .spawn( + rst.spec.clone(), + rst.parent.clone(), + None, + worker_relay_key, + rst.skip_relay_prompt, + None, + ) + .await + { + Ok(_) => { + workers.supervisor.on_restarted(&name); + workers.metrics.on_restart(&name); + if let Some(task) = rst.initial_task { + workers.initial_tasks.insert(name.clone(), task); + } + tracing::info!(name = %name, restart_count = rst.restart_count, "agent restarted"); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_restarted", + "name": name, + "restart_count": rst.restart_count, + }), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + &name, + "spawned", + Some("restarted"), + ) + .await; + } + Err(e) => { + tracing::error!(name = %name, error = %e, "restart failed"); + } + } + } + } + + // Persist pending deliveries for crash recovery + if paths.persist { + if let Err(error) = save_pending_deliveries(&paths.pending, pending_deliveries) { + tracing::warn!(path = %paths.pending.display(), error = %error, "failed to 
persist pending deliveries"); + } + } + } +} diff --git a/crates/broker/src/runtime/mod.rs b/crates/broker/src/runtime/mod.rs index 10ff87895..a16da92dd 100644 --- a/crates/broker/src/runtime/mod.rs +++ b/crates/broker/src/runtime/mod.rs @@ -62,22 +62,28 @@ const DEFAULT_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS: u64 = 20_000; const DEFAULT_HTTP_API_EVENT_EMIT_TIMEOUT_MS: u64 = 200; static TRACING_GUARD: OnceLock = OnceLock::new(); +mod api; mod connection; mod delivery; +mod event_loop; mod headless; mod init; mod io; +mod maintenance; mod messages; mod paths; +mod relaycast_events; mod session; mod spawn_spec; mod system; #[cfg(test)] mod tests; mod util; +mod worker_events; pub(crate) use connection::*; pub(crate) use delivery::*; +pub(crate) use event_loop::*; pub(crate) use headless::*; pub(crate) use init::*; pub(crate) use io::*; diff --git a/crates/broker/src/runtime/relaycast_events.rs b/crates/broker/src/runtime/relaycast_events.rs new file mode 100644 index 000000000..656e7fc3e --- /dev/null +++ b/crates/broker/src/runtime/relaycast_events.rs @@ -0,0 +1,840 @@ +use super::*; + +impl BrokerRuntime { + pub(super) async fn handle_relaycast_message(&mut self, ws_msg: WorkspaceInboundMessage) { + let paths = &self.paths; + let state = &mut self.state; + let workspace_lookup = &self.workspace_lookup; + let default_workspace = &self.default_workspace; + let sdk_out_tx = &self.sdk_out_tx; + let workers = &mut self.workers; + let telemetry = &self.telemetry; + let agent_spawn_count = &mut self.agent_spawn_count; + let dedup = &mut self.dedup; + let pending_deliveries = &mut self.pending_deliveries; + let pending_requests = &mut self.pending_requests; + let delivery_states = &mut self.delivery_states; + let dm_participants_cache = &mut self.dm_participants_cache; + let recent_thread_messages = &mut self.recent_thread_messages; + let delivery_retry_interval = self.delivery_retry_interval; + + let workspace_id = ws_msg.workspace_id.clone(); + let workspace_alias = 
ws_msg.workspace_alias.clone(); + let ws_value = ws_msg.value; + let workspace_state = workspace_lookup + .get(&workspace_id) + .cloned() + .unwrap_or_else(|| default_workspace.clone()); + let workspace_self_name = workspace_state.self_name.clone(); + let workspace_self_names = workspace_state.self_names.clone(); + let workspace_self_agent_ids = workspace_state.self_agent_ids.clone(); + let workspace_http = workspace_state.http_client.clone(); + let ws_type = ws_value + .get("type") + .and_then(Value::as_str) + .unwrap_or(""); + tracing::info!( + target = "agent_relay::broker", + ws_type = %ws_type, + workspace_id = %workspace_id, + event = %ws_value, + "received relaycast ws event" + ); + + let control_dedup_key = + if matches!(ws_type, "agent.spawn_requested" | "agent.release_requested") { + relaycast_ws_control_dedup_key(&workspace_id, ws_type, &ws_value) + } else { + None + }; + + if let Some(ref control_dedup_key) = control_dedup_key { + if !dedup.insert_if_new(control_dedup_key, Instant::now()) { + tracing::info!( + ws_type = %ws_type, + workspace_id = %workspace_id, + "dropping duplicate relaycast control event" + ); + return; + } + } + + if matches!(ws_type, "agent.spawn_requested" | "agent.release_requested") { + if let Err(ref deser_err) = serde_json::from_value::(ws_value.clone()) { + eprintln!( + "[agent-relay] WARNING: failed to deserialize {} event: {}", + ws_type, deser_err + ); + } + } + if let Ok(ws_event) = serde_json::from_value::(ws_value.clone()) { + match ws_event { + WsEvent::AgentReleaseRequested(event) => { + let name = event.agent.name; + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + workspace_http.forget_agent_registration(&name); + tracing::debug!( + worker = %name, + "ignoring relaycast release request for broker self" + ); + return; + } + workers.supervisor.unregister(&name); + workers.metrics.on_release(&name); + match workers.release(&name).await { + Ok(()) => { + 
workspace_http.forget_agent_registration(&name); + let dropped = drop_pending_for_worker(pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + sdk_out_tx, + json!({"kind":"delivery_dropped","name":name,"count":dropped,"reason":"agent_released"}), + ).await; + } + fail_pending_requests_for_worker( + pending_requests, + &name, + "relaycast_release", + ); + delivery_states.remove(&name); + telemetry.track(TelemetryEvent::AgentRelease { + cli: String::new(), + release_reason: "relaycast_release".to_string(), + lifetime_seconds: 0, + release_source: ActionSource::Protocol, + }); + state.agents.remove(&name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!(path = %paths.state.display(), error = %error, "failed to persist broker state"); + } + } + let _ = send_event( + sdk_out_tx, + json!({"kind":"agent_released","name":name}), + ) + .await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "exited", + Some("relaycast_release"), + ) + .await; + tracing::info!(child = %name, "released worker via relaycast in broker mode"); + eprintln!("[agent-relay] released worker '{}' via relaycast", name); + } + Err(error) => { + let message = error.to_string(); + if is_unknown_worker_error_message(&message) { + workspace_http.forget_agent_registration(&name); + state.agents.remove(&name); + if paths.persist { + if let Err(save_error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + error = %save_error, + "failed to persist broker state" + ); + } + } + tracing::debug!( + child = %name, + "ignoring duplicate relaycast release for already exited worker" + ); + } else { + tracing::error!(child = %name, error = %error, "failed to release worker via relaycast"); + eprintln!("[agent-relay] failed to release '{}': {}", name, error); + } + } + } + return; + } + WsEvent::AgentSpawnRequested(event) => { + let name = event.agent.name; + eprintln!( + "[agent-relay] received spawn 
request for '{}' (cli: {})", + name, event.agent.cli + ); + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + tracing::debug!( + worker = %name, + "ignoring relaycast spawn request for broker self" + ); + eprintln!( + "[agent-relay] ignoring spawn request for '{}' (broker self)", + name + ); + return; + } + let local_spawn_echo_key = + relaycast_spawn_control_dedup_key(&workspace_id, &name); + if relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key.as_deref(), + &local_spawn_echo_key, + ) && !dedup.insert_if_new(&local_spawn_echo_key, Instant::now()) + { + tracing::info!( + worker = %name, + workspace_id = %workspace_id, + "dropping duplicate/local relaycast spawn request" + ); + eprintln!( + "[agent-relay] dropping duplicate spawn request for '{}'", + name + ); + return; + } + let cli = event.agent.cli; + let task = Some(event.agent.task).filter(|value| !value.trim().is_empty()); + let channel = event.agent.channel; + + tracing::info!(name = %name, cli = %cli, task = ?task, channel = ?channel, "handling spawn request from relaycast WS"); + let channels = channel + .as_deref() + .map(|ch| { + let mut chs = default_spawn_channels(); + if !chs.contains(&ch.to_string()) { + chs.push(ch.to_string()); + } + chs + }) + .unwrap_or_else(default_spawn_channels); + let spec = AgentSpec { + name: name.clone(), + runtime: AgentRuntime::Pty, + provider: None, + cli: Some(cli.clone()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: vec![], + channels: channels.clone(), + restart_policy: None, + }; + let effective_task = normalize_initial_task(task.clone()); + + // Pre-register agent token. Claude doesn't need this — it + // bakes the API key into --mcp-config JSON and self-registers. + // Non-Claude CLIs need the token injected into their CLI args + // at spawn time, so we do a quick (3s) registration attempt. 
+ let cli_command = parse_cli_command(&cli) + .map(|(cmd, _)| cmd) + .unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); + let is_claude = + cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); + let worker_relay_key = { + let ws_token = relaycast_ws_spawn_token(&ws_value); + if ws_token.is_some() { + ws_token + } else if is_claude { + // Claude self-registers via its MCP server — skip blocking call + None + } else { + const REG_TIMEOUT: Duration = Duration::from_secs(3); + match tokio::time::timeout( + REG_TIMEOUT, + workspace_http.register_agent_token(&name, Some(cli.as_str())), + ) + .await + { + Ok(Ok(token)) => { + tracing::info!( + worker = %name, + "pre-registered agent via broker for WS spawn" + ); + Some(token) + } + Ok(Err(error)) => { + tracing::warn!( + worker = %name, + error = %error, + "WS spawn pre-registration failed; agent will self-register" + ); + None + } + Err(_) => { + tracing::warn!( + worker = %name, + "WS spawn pre-registration timed out (3s); agent will self-register" + ); + None + } + } + } + }; + + match workers + .spawn( + spec, + Some("Relaycast".to_string()), + None, + worker_relay_key.clone(), + false, + Some(workspace_id.clone()), + ) + .await + { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers + .initial_tasks + .insert(name.clone(), task_text.clone()); + } + *agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::Protocol, + has_task: effective_task.is_some(), + is_shadow: false, + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent { + runtime: AgentRuntime::Pty, + parent: Some("Relaycast".to_string()), + channels, + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + 
.duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + }, + ); + if paths.persist { + let _ = state.save(&paths.state); + } + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_spawned", + "name": name, + "runtime": "pty", + "cli": cli, + "model": effective_spec.model.clone(), + "pid": pid, + "source": "relaycast_ws", + "pre_registered": worker_relay_key.is_some(), + }), + ) + .await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "spawned", + Some("relaycast_spawn"), + ) + .await; + tracing::info!(child = %name, pid, "spawned worker via relaycast WS"); + eprintln!("[agent-relay] spawned worker '{}' via relaycast", name); + } + Err(e) => { + let msg = e.to_string(); + if msg.contains("already exists") { + tracing::debug!(child = %name, "agent already spawned via SDK, skipping duplicate relaycast WS spawn"); + } else { + tracing::error!(child = %name, error = %e, "failed to spawn worker via relaycast WS"); + eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); + } + } + } + return; + } + _ => {} + } + } else if ws_type == "agent.spawn_requested" { + // Fallback: the SDK failed to deserialize the event (e.g. missing + // fields like `already_existed` or `task: null`). Extract the + // spawn info directly from the raw JSON so we don't silently + // drop the request. 
+ let agent_obj = ws_value.get("agent"); + let name = agent_obj + .and_then(|a| a.get("name")) + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let cli = agent_obj + .and_then(|a| a.get("cli")) + .and_then(Value::as_str) + .unwrap_or("claude") + .to_string(); + let task = agent_obj + .and_then(|a| a.get("task")) + .and_then(Value::as_str) + .unwrap_or("") + .to_string(); + let channel = agent_obj + .and_then(|a| a.get("channel")) + .and_then(Value::as_str) + .map(String::from); + + if !name.is_empty() { + eprintln!( + "[agent-relay] handling spawn request for '{}' via JSON fallback (cli: {})", + name, cli + ); + + if is_relaycast_self_control_target( + &name, + &workspace_self_name, + &workspace_self_names, + ) { + eprintln!( + "[agent-relay] ignoring spawn request for '{}' (broker self)", + name + ); + } else { + let local_spawn_echo_key = + relaycast_spawn_control_dedup_key(&workspace_id, &name); + let should_dedup = relaycast_ws_should_apply_local_spawn_echo_dedup( + control_dedup_key.as_deref(), + &local_spawn_echo_key, + ); + // Always insert the local echo key for consistency with the primary path + let is_new = dedup.insert_if_new(&local_spawn_echo_key, Instant::now()); + if !should_dedup || is_new { + let channels = channel + .as_deref() + .map(|ch| { + let mut chs = default_spawn_channels(); + if !chs.contains(&ch.to_string()) { + chs.push(ch.to_string()); + } + chs + }) + .unwrap_or_else(default_spawn_channels); + let spec = AgentSpec { + name: name.clone(), + runtime: AgentRuntime::Pty, + provider: None, + cli: Some(cli.clone()), + model: None, + cwd: None, + team: None, + shadow_of: None, + shadow_mode: None, + args: vec![], + channels: channels.clone(), + restart_policy: None, + }; + let task_opt = Some(task).filter(|v| !v.trim().is_empty()); + let effective_task = normalize_initial_task(task_opt.clone()); + + // Pre-register (same logic as primary WS spawn path). 
+ let cli_command = parse_cli_command(&cli) + .map(|(cmd, _)| cmd) + .unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); + let is_claude = + cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); + let worker_relay_key = { + let ws_token = relaycast_ws_spawn_token(&ws_value); + if ws_token.is_some() { + ws_token + } else if is_claude { + None + } else { + const REG_TIMEOUT: Duration = Duration::from_secs(3); + match tokio::time::timeout( + REG_TIMEOUT, + workspace_http.register_agent_token(&name, Some(cli.as_str())), + ) + .await + { + Ok(Ok(token)) => Some(token), + Ok(Err(error)) => { + tracing::warn!( + worker = %name, + error = %error, + "WS spawn fallback pre-registration failed" + ); + None + } + Err(_) => { + tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out (3s)"); + None + } + } + } + }; + + match workers + .spawn( + spec, + Some("Relaycast".to_string()), + None, + worker_relay_key.clone(), + false, + Some(workspace_id.clone()), + ) + .await + { + Ok(effective_spec) => { + if let Some(ref task_text) = effective_task { + workers + .initial_tasks + .insert(name.clone(), task_text.clone()); + } + *agent_spawn_count += 1; + telemetry.track(TelemetryEvent::AgentSpawn { + cli: cli.clone(), + runtime: runtime_label(&effective_spec.runtime).to_string(), + spawn_source: ActionSource::Protocol, + has_task: effective_task.is_some(), + is_shadow: false, + }); + let pid = workers.worker_pid(&name).unwrap_or(0); + state.agents.insert( + name.clone(), + broker::PersistedAgent { + runtime: AgentRuntime::Pty, + parent: Some("Relaycast".to_string()), + channels, + pid: workers.worker_pid(&name), + started_at: Some( + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs(), + ), + spec: Some(effective_spec.clone()), + restart_policy: None, + initial_task: effective_task, + }, + ); + if paths.persist { + let _ = 
state.save(&paths.state); + } + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_spawned", + "name": name, + "runtime": "pty", + "cli": cli, + "model": effective_spec.model.clone(), + "pid": pid, + "source": "relaycast_ws_fallback", + "pre_registered": worker_relay_key.is_some(), + }), + ) + .await; + publish_agent_state_transition( + &workspace_state.ws_control_tx, + &name, + "spawned", + Some("relaycast_spawn"), + ) + .await; + eprintln!("[agent-relay] spawned worker '{}' via relaycast (JSON fallback)", name); + } + Err(e) => { + let msg = e.to_string(); + if !msg.contains("already exists") { + eprintln!("[agent-relay] failed to spawn '{}': {}", name, e); + } + } + } + } else { + eprintln!( + "[agent-relay] dropping duplicate spawn request for '{}' (fallback)", + name + ); + } + } + } + // Don't fall through to map_ws_event for control events + // handled by the JSON fallback path. + return; + } + + // Preserve the raw channel from the WS event for thread replies. + // The mapper may set target = "thread" (synthetic) when the SDK + // struct lacks a channel field; we use the raw value to fix + // display_target so the dashboard can route the message correctly. 
+ let raw_ws_channel = ws_value + .get("channel") + .and_then(Value::as_str) + .map(String::from); + + if let Some(mapped) = map_ws_event(&ws_value, &workspace_id, workspace_alias.as_deref()) { + tracing::info!( + from = %mapped.from, + target = %mapped.target, + kind = ?mapped.kind, + event_id = %mapped.event_id, + text_len = mapped.text.len(), + "mapped inbound WS event" + ); + let dedup_key = format!("{}:{}", mapped.workspace_id, mapped.event_id); + if !dedup.insert_if_new(&dedup_key, Instant::now()) { + tracing::info!(event_id = %mapped.event_id, workspace_id = %mapped.workspace_id, "dropping duplicate event"); + return; + } + let has_local_target = if mapped.target.starts_with('#') { + !workers + .worker_names_for_channel_delivery( + &mapped.target, + &mapped.from, + Some(&workspace_id), + ) + .is_empty() + } else if matches!(mapped.kind, InboundKind::ThreadReply) && mapped.target == "thread" { + // Thread replies target "thread" (synthetic), not a specific worker. + // Treat as having a local target when any worker exists so the + // self-echo filter doesn't drop dashboard-originated thread replies. + workers.has_any_worker() + } else { + workers.has_worker_by_name_ignoring_case(&mapped.target) + }; + if routing::is_self_echo( + &mapped, + &workspace_self_names, + &workspace_self_agent_ids, + has_local_target, + ) { + tracing::info!(from = %mapped.from, sender_agent_id = ?mapped.sender_agent_id, self_names = ?workspace_self_names, "skipping self-echo in broker loop"); + return; + } + + telemetry.track(TelemetryEvent::MessageSend { + is_broadcast: mapped.target.starts_with('#'), + has_thread: mapped.thread_id.is_some(), + }); + + let mut delivery_plan = { + let worker_view = workers.routing_workers(); + routing::resolve_delivery_targets(&mapped, &worker_view) + }; + + // For thread replies with synthetic target "thread", override + // display_target with the actual channel so the dashboard can + // route the message to the correct channel/DM view. 
+ if matches!(mapped.kind, InboundKind::ThreadReply) + && delivery_plan.display_target == "thread" + { + if let Some(ref ch) = raw_ws_channel { + let chan_target = if ch.starts_with('#') { + ch.clone() + } else { + format!("#{ch}") + }; + tracing::info!( + original_target = "thread", + resolved_target = %chan_target, + "overriding thread reply display_target with raw WS channel" + ); + delivery_plan.display_target = chan_target; + } + } + + if mapped.target.starts_with('#') { + tracing::info!( + channel = %mapped.target, + from = %mapped.from, + target_count = delivery_plan.targets.len(), + targets = ?delivery_plan.targets, + "channel delivery targets" + ); + } else { + tracing::info!( + target = %mapped.target, + from = %mapped.from, + kind = ?mapped.kind, + direct_targets = ?delivery_plan.targets, + "direct message routing" + ); + } + + if delivery_plan.needs_dm_resolution { + let conversation_id = mapped.target.clone(); + tracing::info!(conversation_id = %conversation_id, "resolving DM participants"); + let participants = resolve_dm_participants_cached( + &workspace_http, + dm_participants_cache, + &workspace_id, + &conversation_id, + ) + .await; + tracing::info!(participants = ?participants, "resolved DM participants"); + + if let Some(participant) = participants + .iter() + .find(|participant| !agent_name_eq(participant, &mapped.from)) + { + delivery_plan.display_target = participant.clone(); + } + + let worker_view = workers.routing_workers(); + delivery_plan.targets = routing::worker_names_for_dm_participants( + &worker_view, + &participants, + &mapped.from, + Some(&workspace_id), + ); + tracing::info!(dm_targets = ?delivery_plan.targets, "DM participant-based routing targets"); + } + + for worker_name in delivery_plan.targets { + // Inbound-delivery queue: mirrors the /api/send + // queue above. Auto-inject workers drain the queue + // immediately; manual-flush workers leave relaycast + // messages parked until flush. 
The same full-context + // capture makes drains reproduce the original + // delivery (channel/thread/workspace). + match queue_inbound_for_delivery_mode( + delivery_states, + workers, + &worker_name, + InboundContext { + from: &mapped.from, + body: &mapped.text, + target: &mapped.target, + thread_id: mapped.thread_id.as_deref(), + workspace_id: Some(mapped.workspace_id.as_str()), + workspace_alias: mapped.workspace_alias.as_deref(), + priority: mapped.priority.as_u8(), + mode: MessageInjectionMode::Wait, + event_id: Some(&mapped.event_id), + }, + ) { + InboundQueueOutcome::Queued => { + tracing::info!( + target = "agent_relay::broker", + event_id = %mapped.event_id, + worker = %worker_name, + "queued inbound relay message (manual_flush inbound delivery mode)" + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind":"delivery_queued", + "name":&worker_name, + "event_id":&mapped.event_id, + "from":&mapped.from, + "target":&mapped.target, + "reason":"inbound_delivery_manual_flush", + }), + ) + .await; + continue; + } + InboundQueueOutcome::DrainNow(to_drain) => { + for queued in to_drain { + if let Err(error) = try_inject_pending_relay_message( + workers, + pending_deliveries, + &worker_name, + &queued, + delivery_retry_interval, + ) + .await + { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } + } + continue; + } + InboundQueueOutcome::WorkerMissing => {} + } + if let Err(error) = queue_and_try_delivery( + workers, + pending_deliveries, + &worker_name, + &mapped, + delivery_retry_interval, + ) + .await + { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } + } + + let display_target = display_target_for_dashboard( + &delivery_plan.display_target, + &workspace_self_names, + &workspace_self_name, + ); + let display_from = if is_self_name(&workspace_self_names, 
&mapped.from) { + workspace_self_name.clone() + } else { + mapped.from.clone() + }; + tracing::info!( + from = %display_from, + display_target = %display_target, + event_id = %mapped.event_id, + body_len = mapped.text.len(), + "broadcasting relay_inbound to dashboard" + ); + record_thread_history_event( + recent_thread_messages, + json!({ + "event_id": mapped.event_id.clone(), + "from": display_from.clone(), + "target": display_target.clone(), + "text": mapped.text.clone(), + "thread_id": mapped.thread_id.clone(), + "workspace_id": mapped.workspace_id.clone(), + "workspace_alias": mapped.workspace_alias.clone(), + "timestamp": chrono::Utc::now().to_rfc3339(), + }), + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "relay_inbound", + "event_id": mapped.event_id, + "from": display_from, + "target": display_target, + "body": mapped.text, + "thread_id": mapped.thread_id, + "workspace_id": mapped.workspace_id, + "workspace_alias": mapped.workspace_alias, + }), + ) + .await; + } else if ws_type != "broker.connection" && ws_type != "broker.channel_join" { + tracing::info!( + target = "agent_relay::broker", + ws_type = %ws_type, + event = %ws_value, + "relaycast ws event ignored by inbound mapper" + ); + } + } +} diff --git a/crates/broker/src/runtime/tests.rs b/crates/broker/src/runtime/tests.rs index 606e75c0b..04a9b92cd 100644 --- a/crates/broker/src/runtime/tests.rs +++ b/crates/broker/src/runtime/tests.rs @@ -523,14 +523,14 @@ fn contract_timeout_fixture_requires_terminal_failed_guard_before_late_ack() { .and_then(Value::as_str) .expect("timeout fixture requires late_event_kind"); - let source = include_str!("init.rs"); + let source = include_str!("worker_events.rs"); let ack_branch = source .find("msg_type == \"delivery_ack\"") .map(|idx| { let end = (idx + 1200).min(source.len()); &source[idx..end] }) - .expect("main.rs must include delivery_ack handling"); + .expect("worker_events.rs must include delivery_ack handling"); assert!( 
ack_branch.contains(expected_terminal_status) || ack_branch.contains("terminal"), @@ -556,7 +556,11 @@ fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() { .map(str::to_owned) .collect::>(); - let emitted = extract_kind_literals(include_str!("init.rs")); + let emitted = extract_kind_literals(concat!( + include_str!("api.rs"), + include_str!("relaycast_events.rs"), + include_str!("worker_events.rs"), + )); assert!( required.is_subset(&emitted), diff --git a/crates/broker/src/runtime/worker_events.rs b/crates/broker/src/runtime/worker_events.rs new file mode 100644 index 000000000..17c5729e0 --- /dev/null +++ b/crates/broker/src/runtime/worker_events.rs @@ -0,0 +1,573 @@ +use super::*; + +impl BrokerRuntime { + pub(super) async fn handle_worker_event(&mut self, worker_event: WorkerEvent) { + let paths = &self.paths; + let state = &mut self.state; + let sdk_out_tx = &self.sdk_out_tx; + let ws_control_tx = &self.ws_control_tx; + let relaycast_http = &self.relaycast_http; + let workers = &mut self.workers; + let pending_deliveries = &mut self.pending_deliveries; + let terminal_failed_deliveries = &mut self.terminal_failed_deliveries; + let pending_requests = &mut self.pending_requests; + let delivery_states = &mut self.delivery_states; + let delivery_retry_interval = self.delivery_retry_interval; + + match worker_event { + WorkerEvent::Message { name, value } => { + if let Some(msg_type) = value.get("type").and_then(Value::as_str) { + if msg_type == "delivery_ack" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + + // Terminal guard: ignore late delivery_ack events once a + // delivery has reached terminal failed status. 
+ if !delivery_id.is_empty() + && terminal_failed_deliveries.contains(delivery_id) + { + tracing::info!( + worker = %name, + delivery_id = %delivery_id, + "ignoring late delivery_ack after terminal failed status" + ); + return; + } + + if let Ok(ack) = + serde_json::from_value::(payload.clone()) + { + clear_pending_delivery_if_event_matches( + pending_deliveries, + &ack.delivery_id, + Some(&ack.event_id), + &name, + "delivery_ack", + ); + terminal_failed_deliveries.remove(&ack.delivery_id); + } + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_ack", + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + }), + ) + .await; + } + } else if msg_type == "delivery_queued" { + if let Some(payload) = value.get("payload") { + let _ = send_event( + sdk_out_tx, + json!({ + "kind": msg_type, + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + }), + ) + .await; + } + } else if msg_type == "delivery_injected" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + let event_id = payload.get("event_id").and_then(Value::as_str); + clear_pending_delivery_if_event_matches( + pending_deliveries, + delivery_id, + event_id, + &name, + "delivery_injected", + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": msg_type, + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "timestamp": payload.get("timestamp"), + }), + ) + .await; + } + } else if msg_type == "delivery_verified" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + let event_id = payload + .get("event_id") + .and_then(Value::as_str) + .unwrap_or(""); + tracing::debug!( + target = 
"agent_relay::broker", + worker = %name, + delivery_id = %delivery_id, + event_id = %event_id, + "delivery verified by echo detection" + ); + clear_pending_delivery_if_event_matches( + pending_deliveries, + delivery_id, + Some(event_id), + &name, + "delivery_verified", + ); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_verified", + "name": name, + "delivery_id": delivery_id, + "event_id": event_id, + }), + ) + .await; + } + } else if msg_type == "delivery_active" { + if let Some(payload) = value.get("payload") { + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_active", + "name": name, + "delivery_id": payload.get("delivery_id"), + "event_id": payload.get("event_id"), + "pattern": payload.get("pattern"), + }), + ) + .await; + } + } else if msg_type == "delivery_failed" { + if let Some(payload) = value.get("payload") { + let delivery_id = payload + .get("delivery_id") + .and_then(Value::as_str) + .unwrap_or(""); + let event_id = payload + .get("event_id") + .and_then(Value::as_str) + .unwrap_or(""); + let reason = payload + .get("reason") + .and_then(Value::as_str) + .unwrap_or("unknown"); + tracing::warn!( + target = "agent_relay::broker", + worker = %name, + delivery_id = %delivery_id, + event_id = %event_id, + reason = %reason, + "delivery failed — echo not detected" + ); + clear_pending_delivery_if_event_matches( + pending_deliveries, + delivery_id, + Some(event_id), + &name, + "delivery_failed", + ); + if !delivery_id.is_empty() { + terminal_failed_deliveries.insert(delivery_id.to_string()); + } + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_failed", + "name": name, + "delivery_id": delivery_id, + "event_id": event_id, + "reason": reason, + }), + ) + .await; + } + } else if msg_type == "worker_error" { + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "worker_error", + "name": name, + "error": value.get("payload").cloned().unwrap_or(Value::Null) + }), + ) + .await; + } else if 
msg_type.ends_with("_response") { + // Generic worker request/response dispatch. + // Any frame whose `type` ends in + // `_response` is routed by `request_id` + // into the matching parked `oneshot` in + // `pending_requests`. The pending entry + // owns the format/error decoding logic + // via `worker_request::fulfil_response_frame`. + let routed = + worker_request::fulfil_response_frame(pending_requests, &value); + if !routed { + let req_id = value + .get("request_id") + .and_then(Value::as_str) + .unwrap_or(""); + tracing::debug!( + target = "agent_relay::broker", + worker = %name, + msg_type = %msg_type, + request_id = %req_id, + "worker response with no pending caller — dropping" + ); + } + } else if msg_type == "worker_stream" { + let _ = send_event(sdk_out_tx, json!({ + "kind": "worker_stream", + "name": name, + "stream": value.get("payload").and_then(|p| p.get("stream")).cloned().unwrap_or(Value::String("stdout".to_string())), + "chunk": value.get("payload").and_then(|p| p.get("chunk")).cloned().unwrap_or(Value::String(String::new())), + })).await; + } else if msg_type == "worker_ready" { + if let Some(task_text) = workers.initial_tasks.remove(&name) { + let event_id = format!("init_{}", Uuid::new_v4().simple()); + if let Err(e) = queue_and_try_delivery_raw( + workers, + pending_deliveries, + &name, + &event_id, + "broker", + &name, + &task_text, + None, + None, + None, + 2, + MessageInjectionMode::Wait, + delivery_retry_interval, + ) + .await + { + tracing::warn!(worker = %name, error = %e, "failed to deliver initial_task"); + } + } + let runtime = value + .get("payload") + .and_then(|p| p.get("runtime")) + .and_then(Value::as_str) + .unwrap_or("pty"); + let (provider_val, cli_val, model_val) = workers + .workers + .get(&name) + .map(|h| { + ( + h.spec.provider.clone(), + h.spec.cli.clone(), + h.spec.model.clone(), + ) + }) + .unwrap_or((None, None, None)); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "worker_ready", + "name": name, + 
"runtime": runtime, + "provider": provider_val, + "cli": cli_val, + "model": model_val, + }), + ) + .await; + } else if msg_type == "agent_idle" { + let idle_secs = value + .get("payload") + .and_then(|p| p.get("idle_secs")) + .and_then(Value::as_u64) + .unwrap_or(0); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_idle", + "name": name, + "idle_secs": idle_secs, + }), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + &name, + "idle", + Some("idle_threshold"), + ) + .await; + } else if msg_type == "agent_exit" { + let reason = value + .get("payload") + .and_then(|p| p.get("reason")) + .and_then(Value::as_str) + .unwrap_or("unknown"); + tracing::info!(agent = %name, reason = %reason, "agent requested exit"); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_exit", + "name": name, + "reason": reason, + }), + ) + .await; + } else if msg_type == "continuity_command" { + // Agent-initiated continuity: the pty_worker detected a + // KIND: continuity block in PTY output and emitted this event. + let action = value + .get("payload") + .and_then(|p| p.get("action")) + .and_then(Value::as_str) + .unwrap_or(""); + let content = value + .get("payload") + .and_then(|p| p.get("content")) + .and_then(Value::as_str) + .unwrap_or(""); + match action { + "save" => { + let cont_dir = continuity_dir(&paths.state); + if let Err(e) = std::fs::create_dir_all(&cont_dir) { + tracing::warn!( + agent = %name, + error = %e, + "continuity_command save: failed to create dir" + ); + } else { + // Build a minimal continuity record with the provided summary. 
+ let agent_data = state.agents.get(&name); + let cli = agent_data + .and_then(|d| d.spec.as_ref()) + .and_then(|s| s.cli.clone()); + let initial_task = + agent_data.and_then(|d| d.initial_task.clone()); + let continuity = json!({ + "agent_name": name, + "cli": cli, + "initial_task": initial_task, + "released_at": null, + "lifetime_seconds": null, + "message_history": [], + "summary": content, + }); + let cont_file = cont_dir.join(format!("{}.json", name)); + match std::fs::write( + &cont_file, + serde_json::to_string_pretty(&continuity) + .unwrap_or_default(), + ) { + Ok(()) => tracing::info!( + agent = %name, + path = %cont_file.display(), + "continuity_command: saved agent-initiated continuity" + ), + Err(e) => tracing::warn!( + agent = %name, + error = %e, + "continuity_command save: failed to write file" + ), + } + } + } + "load" => { + let cont_dir = continuity_dir(&paths.state); + let cont_file = cont_dir.join(format!("{}.json", name)); + if cont_file.exists() { + match std::fs::read_to_string(&cont_file) { + Ok(raw) => { + if let Ok(ctx) = serde_json::from_str::(&raw) { + // Build a context summary and inject it + let prev_task = ctx + .get("initial_task") + .and_then(Value::as_str) + .unwrap_or("unknown"); + let summary = ctx + .get("summary") + .and_then(Value::as_str) + .unwrap_or("no summary"); + let history_str = ctx + .get("message_history") + .and_then(Value::as_array) + .map(|msgs| { + msgs.iter() + .filter_map(|m| { + let from = + m.get("from")?.as_str()?; + let text = m + .get("text") + .or_else(|| m.get("body"))? 
+ .as_str()?; + Some(format!( + " - {}: {}", + from, text + )) + }) + .collect::>() + .join("\n") + }) + .unwrap_or_default(); + let history_section = if history_str.is_empty() { + String::new() + } else { + format!("\nRecent messages:\n{}", history_str) + }; + let inject_body = format!( + "## Continuity Context (from previous session as '{}')\n\ + Previous task: {}\n\ + Session summary: {}{}", + name, prev_task, summary, history_section + ); + let event_id = format!( + "cont_load_{}", + Uuid::new_v4().simple() + ); + if let Err(e) = queue_and_try_delivery_raw( + workers, + pending_deliveries, + &name, + &event_id, + "broker", + &name, + &inject_body, + None, + None, + None, + 2, + MessageInjectionMode::Wait, + delivery_retry_interval, + ) + .await + { + tracing::warn!( + agent = %name, + error = %e, + "continuity_command load: failed to inject context" + ); + } else { + tracing::info!( + agent = %name, + "continuity_command: injected loaded context" + ); + } + } + } + Err(e) => tracing::warn!( + agent = %name, + error = %e, + "continuity_command load: failed to read file" + ), + } + } else { + tracing::debug!( + agent = %name, + "continuity_command load: no continuity file found" + ); + } + } + "uncertain" => { + tracing::info!( + agent = %name, + content = %content, + "continuity_command: agent reported uncertainty" + ); + } + other => { + tracing::warn!( + agent = %name, + action = %other, + "continuity_command: unknown action ignored" + ); + } + } + } else if msg_type == "worker_exited" { + // PTY worker process is exiting — clean up and + // emit agent_exited so the SDK doesn't have to + // wait for the reap_exited polling cycle. 
+ let code = value + .get("payload") + .and_then(|p| p.get("code")) + .and_then(Value::as_i64) + .map(|c| c as i32); + let signal = value + .get("payload") + .and_then(|p| p.get("signal")) + .and_then(Value::as_str) + .map(String::from); + tracing::info!( + agent = %name, + code = ?code, + signal = ?signal, + "worker_exited received — cleaning up" + ); + // Remove from registry so reap_exited won't + // double-process this worker. + workers.workers.remove(&name); + workers.initial_tasks.remove(&name); + // Drop pending deliveries for this worker + let dropped = drop_pending_for_worker(pending_deliveries, &name); + if dropped > 0 { + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "delivery_dropped", + "name": name, + "count": dropped, + "reason": "worker_exited", + }), + ) + .await; + } + fail_pending_requests_for_worker(pending_requests, &name, "worker_exited"); + delivery_states.remove(&name); + let _ = send_event( + sdk_out_tx, + json!({ + "kind": "agent_exited", + "name": name, + "code": code, + "signal": signal, + }), + ) + .await; + publish_agent_state_transition( + ws_control_tx, + &name, + "exited", + Some("worker_exited"), + ) + .await; + if let Err(error) = relaycast_http.mark_agent_offline(&name).await { + tracing::warn!( + worker = %name, + error = %error, + "failed to mark exited worker offline in relaycast" + ); + } + state.agents.remove(&name); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + error = %error, + "failed to persist broker state" + ); + } + } + } + } + } + } + } +} From 9f185d6b782aa77f67346605fd0c093c1673c190 Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 22:10:13 -0400 Subject: [PATCH 7/8] fix: address runtime review findings --- .../completed/2026-05/traj_pmrcfj6or3pz.json | 57 +++++ .../completed/2026-05/traj_pmrcfj6or3pz.md | 34 +++ .trajectories/index.json | 9 +- crates/broker/src/runtime/api.rs | 224 ++++++++++++++++-- 
crates/broker/src/runtime/delivery.rs | 8 +- crates/broker/src/runtime/event_loop.rs | 21 +- crates/broker/src/runtime/init.rs | 5 +- crates/broker/src/runtime/maintenance.rs | 44 +++- crates/broker/src/runtime/messages.rs | 10 +- crates/broker/src/runtime/paths.rs | 30 ++- crates/broker/src/runtime/relaycast_events.rs | 101 +++++--- crates/broker/src/runtime/session.rs | 3 - crates/broker/src/runtime/tests.rs | 47 +++- crates/broker/src/runtime/util.rs | 7 + crates/broker/src/runtime/worker_events.rs | 61 +---- crates/broker/src/worker_request.rs | 6 +- 16 files changed, 524 insertions(+), 143 deletions(-) create mode 100644 .trajectories/completed/2026-05/traj_pmrcfj6or3pz.json create mode 100644 .trajectories/completed/2026-05/traj_pmrcfj6or3pz.md diff --git a/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.json b/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.json new file mode 100644 index 000000000..0d6497517 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.json @@ -0,0 +1,57 @@ +{ + "id": "traj_pmrcfj6or3pz", + "version": 1, + "task": { + "title": "Address runtime split review comments", + "source": { + "system": "plain", + "id": "PR-906" + } + }, + "status": "completed", + "startedAt": "2026-05-19T02:03:43.962Z", + "completedAt": "2026-05-19T02:09:31.002Z", + "agents": [ + { + "name": "default", + "role": "lead", + "joinedAt": "2026-05-19T02:09:20.475Z" + } + ], + "chapters": [ + { + "id": "chap_7s5frd9sadcc", + "title": "Work", + "agentName": "default", + "startedAt": "2026-05-19T02:09:20.475Z", + "endedAt": "2026-05-19T02:09:31.002Z", + "events": [ + { + "ts": 1779156560476, + "type": "decision", + "content": "Fixed runtime split review findings with behavioral changes: Fixed runtime split review findings with behavioral changes", + "raw": { + "question": "Fixed runtime split review findings with behavioral changes", + "chosen": "Fixed runtime split review findings with behavioral changes", + "alternatives": [], + "reasoning": 
"Channel subscription APIs now update live websocket subscriptions and persisted worker specs; worker_exited frames defer cleanup to reap_exited so supervisor restart decisions are preserved; relaycast local delivery now uses the same bounded timeout path as HTTP delivery." + }, + "significance": "high" + } + ] + } + ], + "retrospective": { + "summary": "Addressed runtime split review comments with behavioral fixes: channel subscribe/unsubscribe now synchronizes Relaycast websocket subscriptions and persisted specs, receiver closure no longer spins the event loop, restarts refresh persisted metadata, relaycast delivery is timeout-bounded, worker_exited frames defer to reap_exited, ephemeral paths are unique per broker instance, token prefixes are removed from identity debug files, numeric thread timestamps normalize to milliseconds, and env-mutating tests are serialized.", + "approach": "Standard approach", + "confidence": 0.9 + }, + "commits": [], + "filesChanged": [], + "projectId": "/Users/will/Projects/AgentWorkforce/relay", + "tags": [], + "_trace": { + "startRef": "7672f7081fd75a7bd49116ab8e91411f1b3123ba", + "endRef": "7672f7081fd75a7bd49116ab8e91411f1b3123ba" + } +} diff --git a/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.md b/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.md new file mode 100644 index 000000000..d1fa3b7e2 --- /dev/null +++ b/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.md @@ -0,0 +1,34 @@ +# Trajectory: Address runtime split review comments + +> **Status:** ✅ Completed +> **Task:** PR-906 +> **Confidence:** 90% +> **Started:** May 18, 2026 at 10:03 PM +> **Completed:** May 18, 2026 at 10:09 PM + +--- + +## Summary + +Addressed runtime split review comments with behavioral fixes: channel subscribe/unsubscribe now synchronizes Relaycast websocket subscriptions and persisted specs, receiver closure no longer spins the event loop, restarts refresh persisted metadata, relaycast delivery is timeout-bounded, worker_exited frames 
defer to reap_exited, ephemeral paths are unique per broker instance, token prefixes are removed from identity debug files, numeric thread timestamps normalize to milliseconds, and env-mutating tests are serialized. + +**Approach:** Standard approach + +--- + +## Key Decisions + +### Fixed runtime split review findings with behavioral changes + +- **Chose:** Fixed runtime split review findings with behavioral changes +- **Reasoning:** Channel subscription APIs now update live websocket subscriptions and persisted worker specs; worker_exited frames defer cleanup to reap_exited so supervisor restart decisions are preserved; relaycast local delivery now uses the same bounded timeout path as HTTP delivery. + +--- + +## Chapters + +### 1. Work + +_Agent: default_ + +- Fixed runtime split review findings with behavioral changes: Fixed runtime split review findings with behavioral changes diff --git a/.trajectories/index.json b/.trajectories/index.json index c629337ae..4e6d93788 100644 --- a/.trajectories/index.json +++ b/.trajectories/index.json @@ -1,6 +1,6 @@ { "version": 1, - "lastUpdated": "2026-05-19T01:50:40.535Z", + "lastUpdated": "2026-05-19T02:09:31.149Z", "trajectories": { "traj_05xg7j388bc4": { "title": "Add browser workflow step integration", @@ -974,6 +974,13 @@ "startedAt": "2026-05-19T01:42:10.602Z", "completedAt": "2026-05-19T01:50:40.359Z", "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_x37bhga2j5ph.json" + }, + "traj_pmrcfj6or3pz": { + "title": "Address runtime split review comments", + "status": "completed", + "startedAt": "2026-05-19T02:03:43.962Z", + "completedAt": "2026-05-19T02:09:31.002Z", + "path": "/Users/will/Projects/AgentWorkforce/relay/.trajectories/completed/2026-05/traj_pmrcfj6or3pz.json" } } } diff --git a/crates/broker/src/runtime/api.rs b/crates/broker/src/runtime/api.rs index 3c621de82..e8622d73b 100644 --- a/crates/broker/src/runtime/api.rs +++ b/crates/broker/src/runtime/api.rs @@ -6,6 +6,7 @@ 
impl BrokerRuntime { let state = &mut self.state; let workspaces = &self.workspaces; let workspace_lookup = &self.workspace_lookup; + let default_workspace = &self.default_workspace; let default_workspace_id = &self.default_workspace_id; let self_names = &self.self_names; let relaycast_http = &self.relaycast_http; @@ -1044,23 +1045,75 @@ impl BrokerRuntime { channels, reply, } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - return; + let (workspace_id, parent, spec, pid, added, all_channels) = { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + return; + }; + let mut added = Vec::new(); + for ch in &channels { + let exists = handle + .spec + .channels + .iter() + .any(|c| c.eq_ignore_ascii_case(ch)); + if !exists { + handle.spec.channels.push(ch.clone()); + added.push(ch.clone()); + } + } + ( + handle.workspace_id.clone(), + handle.parent.clone(), + handle.spec.clone(), + handle.child.id(), + added, + handle.spec.channels.clone(), + ) }; - let mut added = Vec::new(); - for ch in &channels { - let exists = handle - .spec - .channels - .iter() - .any(|c| c.eq_ignore_ascii_case(ch)); - if !exists { - handle.spec.channels.push(ch.clone()); - added.push(ch.clone()); + + if !added.is_empty() { + let workspace = workspace_for_channel_update( + workspace_id.as_deref(), + workspace_lookup, + default_workspace_id.as_deref(), + default_workspace, + ); + if let Err(error) = workspace.http_client.ensure_extra_channels(&added).await { + tracing::warn!( + worker = %name, + workspace_id = %workspace.workspace_id, + channels = ?added, + error = %error, + "failed to ensure subscribed channels" + ); + } + if let Err(error) = workspace + .ws_control_tx + .send(WsControl::Subscribe(added.clone())) + .await + { + tracing::warn!( + worker = %name, + workspace_id = %workspace.workspace_id, + channels = ?added, + error = %error, + 
"failed to send ws channel subscribe control" + ); + } + } + + persist_agent_channels(state, &name, parent, spec, pid, all_channels.clone()); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + worker = %name, + error = %error, + "failed to persist channel subscriptions" + ); } } - let all_channels = handle.spec.channels.clone(); let _ = reply.send(Ok(json!({ "name": name, "channels": all_channels, @@ -1071,15 +1124,87 @@ impl BrokerRuntime { channels, reply, } => { - let Some(handle) = workers.workers.get_mut(&name) else { - let _ = reply.send(Err(format!("unknown worker '{}'", name))); - return; + let (workspace_id, parent, spec, pid, removed, remaining) = { + let Some(handle) = workers.workers.get_mut(&name) else { + let _ = reply.send(Err(format!("unknown worker '{}'", name))); + return; + }; + let before = handle.spec.channels.clone(); + handle + .spec + .channels + .retain(|c| !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c))); + let remaining = handle.spec.channels.clone(); + let removed = before + .into_iter() + .filter(|channel| { + !remaining + .iter() + .any(|kept| kept.eq_ignore_ascii_case(channel)) + }) + .collect::>(); + ( + handle.workspace_id.clone(), + handle.parent.clone(), + handle.spec.clone(), + handle.child.id(), + removed, + remaining, + ) }; - handle - .spec - .channels - .retain(|c| !channels.iter().any(|rem| rem.eq_ignore_ascii_case(c))); - let remaining = handle.spec.channels.clone(); + + if !removed.is_empty() { + let workspace = workspace_for_channel_update( + workspace_id.as_deref(), + workspace_lookup, + default_workspace_id.as_deref(), + default_workspace, + ); + let target_workspace_id = effective_channel_workspace_id( + workspace_id.as_deref(), + default_workspace_id.as_deref(), + ); + let unsubscribe = removed + .iter() + .filter(|channel| { + !workers.workers.values().any(|handle| { + effective_channel_workspace_id( + handle.workspace_id.as_deref(), + 
default_workspace_id.as_deref(), + ) == target_workspace_id + && channel_in_list(&handle.spec.channels, channel) + }) + }) + .cloned() + .collect::>(); + if !unsubscribe.is_empty() { + if let Err(error) = workspace + .ws_control_tx + .send(WsControl::Unsubscribe(unsubscribe.clone())) + .await + { + tracing::warn!( + worker = %name, + workspace_id = %workspace.workspace_id, + channels = ?unsubscribe, + error = %error, + "failed to send ws channel unsubscribe control" + ); + } + } + } + + persist_agent_channels(state, &name, parent, spec, pid, remaining.clone()); + if paths.persist { + if let Err(error) = state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + worker = %name, + error = %error, + "failed to persist channel subscriptions" + ); + } + } let _ = reply.send(Ok(json!({ "name": name, "channels": remaining, @@ -1234,3 +1359,58 @@ impl BrokerRuntime { } } } + +fn workspace_for_channel_update<'a>( + workspace_id: Option<&str>, + workspace_lookup: &'a HashMap, + default_workspace_id: Option<&str>, + default_workspace: &'a RelayWorkspace, +) -> &'a RelayWorkspace { + workspace_id + .and_then(|id| workspace_lookup.get(id)) + .or_else(|| default_workspace_id.and_then(|id| workspace_lookup.get(id))) + .unwrap_or(default_workspace) +} + +fn effective_channel_workspace_id<'a>( + workspace_id: Option<&'a str>, + default_workspace_id: Option<&'a str>, +) -> Option<&'a str> { + workspace_id.or(default_workspace_id) +} + +fn channel_in_list(channels: &[String], channel: &str) -> bool { + channels + .iter() + .any(|existing| existing.eq_ignore_ascii_case(channel)) +} + +fn persist_agent_channels( + state: &mut broker::BrokerState, + name: &str, + parent: Option, + mut spec: AgentSpec, + pid: Option, + channels: Vec, +) { + spec.channels = channels.clone(); + let runtime = spec.runtime.clone(); + let agent = state + .agents + .entry(name.to_string()) + .or_insert_with(|| broker::PersistedAgent { + runtime: runtime.clone(), + parent: parent.clone(), + 
channels: channels.clone(), + pid, + started_at: Some(unix_timestamp_secs()), + spec: Some(spec.clone()), + restart_policy: None, + initial_task: None, + }); + agent.runtime = runtime; + agent.parent = parent; + agent.channels = channels; + agent.pid = pid; + agent.spec = Some(spec); +} diff --git a/crates/broker/src/runtime/delivery.rs b/crates/broker/src/runtime/delivery.rs index 48b101af3..e8f59e4c5 100644 --- a/crates/broker/src/runtime/delivery.rs +++ b/crates/broker/src/runtime/delivery.rs @@ -387,10 +387,10 @@ pub(crate) fn drop_pending_for_worker( /// Drain every in-flight worker request targeting `worker_name` and /// notify each awaiter with [`worker_request::RequestWorkerError::WorkerDisappeared`]. -/// Called from every worker-teardown path (explicit release, -/// `worker_exited` frame, `reap_exited` periodic sweep) so HTTP callers -/// don't have to wait out the request deadline when the worker has -/// clearly gone. Logs one structured warning per drained request. +/// Called from every worker-teardown path (explicit release or +/// `reap_exited` periodic sweep) so HTTP callers don't have to wait out +/// the request deadline when the worker has clearly gone. Logs one +/// structured warning per drained request. 
pub(crate) fn fail_pending_requests_for_worker( pending_requests: &mut HashMap, worker_name: &str, diff --git a/crates/broker/src/runtime/event_loop.rs b/crates/broker/src/runtime/event_loop.rs index b4d766b80..ed9ac9d1e 100644 --- a/crates/broker/src/runtime/event_loop.rs +++ b/crates/broker/src/runtime/event_loop.rs @@ -14,9 +14,12 @@ pub(crate) struct BrokerRuntime { pub(super) ws_control_tx: mpsc::Sender, pub(super) relaycast_http: RelaycastHttpClient, pub(super) api_rx: mpsc::Receiver, + pub(super) api_open: bool, pub(super) ws_inbound_rx: mpsc::Receiver, + pub(super) relaycast_open: bool, pub(super) sdk_out_tx: mpsc::Sender>, pub(super) worker_event_rx: mpsc::Receiver, + pub(super) worker_events_open: bool, pub(super) workers: WorkerRegistry, pub(super) crash_insights: relay_broker::crash_insights::CrashInsights, pub(super) crash_insights_path: PathBuf, @@ -61,13 +64,13 @@ impl BrokerRuntime { _ = tokio::signal::ctrl_c() => RuntimeEvent::CtrlC, _ = self.lease_check.tick() => RuntimeEvent::LeaseTick, _ = self.sigterm.recv() => RuntimeEvent::Sigterm, - request = self.api_rx.recv() => match request { + request = self.api_rx.recv(), if self.api_open => match request { Some(request) => RuntimeEvent::Api(Box::new(request)), None => RuntimeEvent::ApiClosed, }, result = self.sdk_lines.next_line(), if self.stdin_open => RuntimeEvent::Stdin(result), - message = self.ws_inbound_rx.recv() => RuntimeEvent::Relaycast(message), - event = self.worker_event_rx.recv() => RuntimeEvent::Worker(event), + message = self.ws_inbound_rx.recv(), if self.relaycast_open => RuntimeEvent::Relaycast(message), + event = self.worker_event_rx.recv(), if self.worker_events_open => RuntimeEvent::Worker(event), _ = self.reap_tick.tick() => RuntimeEvent::MaintenanceTick, }; @@ -85,7 +88,9 @@ impl BrokerRuntime { RuntimeEvent::Api(request) => { self.handle_api_request(*request).await; } - RuntimeEvent::ApiClosed => {} + RuntimeEvent::ApiClosed => { + self.api_open = false; + } 
RuntimeEvent::Stdin(result) => { if matches!(result, Ok(None) | Err(_)) { self.stdin_open = false; @@ -94,11 +99,15 @@ impl BrokerRuntime { RuntimeEvent::Relaycast(Some(message)) => { self.handle_relaycast_message(message).await; } - RuntimeEvent::Relaycast(None) => {} + RuntimeEvent::Relaycast(None) => { + self.relaycast_open = false; + } RuntimeEvent::Worker(Some(event)) => { self.handle_worker_event(event).await; } - RuntimeEvent::Worker(None) => {} + RuntimeEvent::Worker(None) => { + self.worker_events_open = false; + } RuntimeEvent::MaintenanceTick => { self.handle_maintenance_tick().await; } diff --git a/crates/broker/src/runtime/init.rs b/crates/broker/src/runtime/init.rs index 72100bb5c..728c4688c 100644 --- a/crates/broker/src/runtime/init.rs +++ b/crates/broker/src/runtime/init.rs @@ -389,7 +389,7 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re // drain without touching `WorkerHandle` (which holds OS-level // process state). See `relay_broker::types::InboundDeliveryState`. Entries // are created lazily on first lookup and removed wherever workers - // exit (`Release` arm, `worker_exited` frame, `reap_exited` sweep). + // exit (`Release` arm or `reap_exited` sweep). 
let delivery_states: HashMap = HashMap::new(); let dm_participants_cache: HashMap)> = HashMap::new(); let recent_thread_messages: VecDeque = VecDeque::new(); @@ -437,9 +437,12 @@ pub(crate) async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Re ws_control_tx, relaycast_http, api_rx, + api_open: true, ws_inbound_rx, + relaycast_open: true, sdk_out_tx, worker_event_rx, + worker_events_open: true, workers, crash_insights, crash_insights_path, diff --git a/crates/broker/src/runtime/maintenance.rs b/crates/broker/src/runtime/maintenance.rs index 508d4db46..44bcf42f3 100644 --- a/crates/broker/src/runtime/maintenance.rs +++ b/crates/broker/src/runtime/maintenance.rs @@ -312,12 +312,52 @@ impl BrokerRuntime { ) .await { - Ok(_) => { + Ok(effective_spec) => { workers.supervisor.on_restarted(&name); workers.metrics.on_restart(&name); - if let Some(task) = rst.initial_task { + let initial_task = rst.initial_task.clone(); + if let Some(task) = initial_task.clone() { workers.initial_tasks.insert(name.clone(), task); } + let pid = workers.worker_pid(&name); + let restart_policy = state + .agents + .get(&name) + .and_then(|agent| agent.restart_policy.clone()) + .or_else(|| effective_spec.restart_policy.clone()); + state + .agents + .entry(name.clone()) + .and_modify(|agent| { + agent.runtime = effective_spec.runtime.clone(); + agent.parent = rst.parent.clone(); + agent.channels = effective_spec.channels.clone(); + agent.pid = pid; + agent.started_at = Some(unix_timestamp_secs()); + agent.spec = Some(effective_spec.clone()); + agent.restart_policy = restart_policy.clone(); + agent.initial_task = initial_task.clone(); + }) + .or_insert_with(|| broker::PersistedAgent { + runtime: effective_spec.runtime.clone(), + parent: rst.parent.clone(), + channels: effective_spec.channels.clone(), + pid, + started_at: Some(unix_timestamp_secs()), + spec: Some(effective_spec.clone()), + restart_policy, + initial_task, + }); + if paths.persist { + if let Err(error) = 
state.save(&paths.state) { + tracing::warn!( + path = %paths.state.display(), + worker = %name, + error = %error, + "failed to persist restarted worker state" + ); + } + } tracing::info!(name = %name, restart_count = rst.restart_count, "agent restarted"); let _ = send_event( sdk_out_tx, diff --git a/crates/broker/src/runtime/messages.rs b/crates/broker/src/runtime/messages.rs index ca6d3f61a..1449793a9 100644 --- a/crates/broker/src/runtime/messages.rs +++ b/crates/broker/src/runtime/messages.rs @@ -275,13 +275,21 @@ pub(crate) fn truncate_thread_preview(input: &str, max_len: usize) -> String { out } +/// Parse a message timestamp into a millisecond sort key. +/// +/// Numeric values below `4_102_444_800` are treated as Unix seconds so mixed +/// second, millisecond, and RFC3339 inputs sort in the same unit. pub(crate) fn parse_sort_key_from_raw_timestamp(raw: &str) -> Option { let trimmed = raw.trim(); if trimmed.is_empty() { return None; } if let Ok(epoch) = trimmed.parse::() { - return Some(epoch); + return Some(if epoch < 4_102_444_800 { + epoch.saturating_mul(1_000) + } else { + epoch + }); } chrono::DateTime::parse_from_rfc3339(trimmed) .ok() diff --git a/crates/broker/src/runtime/paths.rs b/crates/broker/src/runtime/paths.rs index eb0301b75..eef67eda5 100644 --- a/crates/broker/src/runtime/paths.rs +++ b/crates/broker/src/runtime/paths.rs @@ -23,8 +23,8 @@ pub(crate) fn continuity_dir(state_path: &Path) -> PathBuf { /// /// Unlike `ensure_runtime_paths`, this function: /// - Writes nothing to the project directory -/// - Uses a deterministic temp directory derived from cwd+broker name so -/// duplicate brokers still collide on the same lock/PID files +/// - Uses a unique temp directory per broker instance so concurrent +/// ephemeral brokers cannot collide on state files /// /// The temp directory is NOT removed on exit — the OS cleans it up on reboot. 
/// State and pending-delivery files are still written there so they don't @@ -34,10 +34,28 @@ pub(crate) fn continuity_dir(state_path: &Path) -> PathBuf { /// parent (SDK client) exits, stdin gets EOF and the broker shuts down. /// Single-instance enforcement is unnecessary here because each SDK client /// manages its own child process. -pub(crate) fn ensure_ephemeral_paths(_cwd: &Path, _broker_name: &str) -> Result { - // Use a random temp subdir so concurrent ephemeral brokers don't collide - // on state files. - let root = std::env::temp_dir().join(format!("agent-relay-ephemeral-{}", std::process::id())); +pub(crate) fn ensure_ephemeral_paths(_cwd: &Path, broker_name: &str) -> Result { + let safe_name: String = broker_name + .chars() + .map(|c| { + if c.is_alphanumeric() || c == '-' { + c + } else { + '-' + } + }) + .collect(); + let safe_name = if safe_name.is_empty() { + "broker".to_string() + } else { + safe_name + }; + let root = std::env::temp_dir().join(format!( + "agent-relay-ephemeral-{}-{}-{}", + std::process::id(), + safe_name, + Uuid::new_v4().simple() + )); std::fs::create_dir_all(&root) .with_context(|| format!("failed to create ephemeral temp dir {}", root.display()))?; diff --git a/crates/broker/src/runtime/relaycast_events.rs b/crates/broker/src/runtime/relaycast_events.rs index 656e7fc3e..63f5d5adf 100644 --- a/crates/broker/src/runtime/relaycast_events.rs +++ b/crates/broker/src/runtime/relaycast_events.rs @@ -693,6 +693,7 @@ impl BrokerRuntime { tracing::info!(dm_targets = ?delivery_plan.targets, "DM participant-based routing targets"); } + let local_delivery_timeout = http_api_local_delivery_timeout(); for worker_name in delivery_plan.targets { // Inbound-delivery queue: mirrors the /api/send // queue above. 
Auto-inject workers drain the queue @@ -739,48 +740,88 @@ impl BrokerRuntime { } InboundQueueOutcome::DrainNow(to_drain) => { for queued in to_drain { - if let Err(error) = try_inject_pending_relay_message( - workers, - pending_deliveries, - &worker_name, - &queued, - delivery_retry_interval, + match timeout( + local_delivery_timeout, + try_inject_pending_relay_message( + workers, + pending_deliveries, + &worker_name, + &queued, + delivery_retry_interval, + ), ) .await { - let _ = send_error( - sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"worker": worker_name})), - ) - .await; + Ok(Ok(())) => {} + Ok(Err(error)) => { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } + Err(_) => { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + format!( + "relaycast delivery timed out after {}ms", + local_delivery_timeout.as_millis() + ), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } } } continue; } InboundQueueOutcome::WorkerMissing => {} } - if let Err(error) = queue_and_try_delivery( - workers, - pending_deliveries, - &worker_name, - &mapped, - delivery_retry_interval, + match timeout( + local_delivery_timeout, + queue_and_try_delivery( + workers, + pending_deliveries, + &worker_name, + &mapped, + delivery_retry_interval, + ), ) .await { - let _ = send_error( - sdk_out_tx, - None, - "delivery_failed", - error.to_string(), - true, - Some(json!({"worker": worker_name})), - ) - .await; + Ok(Ok(())) => {} + Ok(Err(error)) => { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + error.to_string(), + true, + Some(json!({"worker": worker_name})), + ) + .await; + } + Err(_) => { + let _ = send_error( + sdk_out_tx, + None, + "delivery_failed", + format!( + "relaycast delivery timed out after {}ms", + local_delivery_timeout.as_millis() + ), + true, + Some(json!({"worker": worker_name})), + ) + 
.await; + } } } diff --git a/crates/broker/src/runtime/session.rs b/crates/broker/src/runtime/session.rs index 5625629d7..73fee5c28 100644 --- a/crates/broker/src/runtime/session.rs +++ b/crates/broker/src/runtime/session.rs @@ -158,7 +158,6 @@ pub(crate) async fn connect_relay(opts: RelaySessionOptions<'_>) -> Result) -> Result &'static Mutex<()> { + static LOCK: OnceLock> = OnceLock::new(); + LOCK.get_or_init(|| Mutex::new(())) +} + async fn make_worker_registry_with_worker(name: &str) -> WorkerRegistry { let (tx, _rx) = mpsc::channel::(16); let mut registry = WorkerRegistry::new( @@ -451,6 +458,7 @@ async fn contract_health_fixture_requires_rich_listen_health_shape() { #[tokio::test] async fn contract_startup_429_fixture_requires_degraded_health_status() { + let _guard = env_test_lock().lock().expect("env test lock"); let fixture: Value = serde_json::from_str(include_str!( "../../../../packages/contracts/fixtures/health-fixtures.json" )) @@ -558,6 +566,7 @@ fn contract_broadcast_whitelist_fixture_requires_filtering_to_required_kinds() { let emitted = extract_kind_literals(concat!( include_str!("api.rs"), + include_str!("maintenance.rs"), include_str!("relaycast_events.rs"), include_str!("worker_events.rs"), )); @@ -749,6 +758,22 @@ fn build_thread_infos_respects_explicit_unread_count() { assert_eq!(threads[0].unread_count, 7); } +#[test] +fn parse_sort_key_normalizes_numeric_seconds_to_millis() { + assert_eq!( + parse_sort_key_from_raw_timestamp("1771840800"), + Some(1_771_840_800_000) + ); + assert_eq!( + parse_sort_key_from_raw_timestamp("1771840800000"), + Some(1_771_840_800_000) + ); + assert_eq!( + parse_sort_key_from_raw_timestamp("2026-02-23T10:00:00Z"), + Some(1_771_840_800_000) + ); +} + #[test] fn build_agent_state_transition_event_has_expected_shape() { let payload = build_agent_state_transition_event("worker-a", "spawned", Some("sdk_spawn")); @@ -860,6 +885,7 @@ fn display_target_for_dashboard_maps_self_identity() { #[test] fn 
delivery_retry_interval_uses_default_and_env_override() { + let _guard = env_test_lock().lock().expect("env test lock"); std::env::remove_var("AGENT_RELAY_DELIVERY_RETRY_MS"); assert_eq!(delivery_retry_interval().as_millis(), 1_000); @@ -874,6 +900,7 @@ fn delivery_retry_interval_uses_default_and_env_override() { #[test] fn http_api_timeout_windows_use_default_and_env_override() { + let _guard = env_test_lock().lock().expect("env test lock"); std::env::remove_var("AGENT_RELAY_HTTP_API_LOCAL_DELIVERY_TIMEOUT_MS"); std::env::remove_var("AGENT_RELAY_HTTP_API_RELAYCAST_SEND_TIMEOUT_MS"); std::env::remove_var("AGENT_RELAY_HTTP_API_EVENT_EMIT_TIMEOUT_MS"); @@ -1551,6 +1578,18 @@ fn continuity_dir_preserves_relative_paths() { assert_eq!(result, std::path::PathBuf::from(".agent-relay/continuity")); } +#[test] +fn ephemeral_paths_are_unique_per_broker_instance() { + let cwd = PathBuf::from("/tmp/agent-relay-test-project"); + let first = ensure_ephemeral_paths(&cwd, "test broker").expect("first ephemeral paths"); + let second = ensure_ephemeral_paths(&cwd, "test broker").expect("second ephemeral paths"); + + assert_ne!(first.state, second.state); + assert_ne!(first.pending, second.pending); + assert!(first.state.parent().unwrap().exists()); + assert!(second.state.parent().unwrap().exists()); +} + #[test] fn http_api_spawn_spec_defaults_to_pty_runtime() { let spec = build_http_api_spawn_spec( diff --git a/crates/broker/src/runtime/util.rs b/crates/broker/src/runtime/util.rs index 7cccdce55..b2ca42c0a 100644 --- a/crates/broker/src/runtime/util.rs +++ b/crates/broker/src/runtime/util.rs @@ -19,6 +19,13 @@ pub(crate) fn log_startup_phase(enabled: bool, started_at: Instant, message: imp } } +pub(crate) fn unix_timestamp_secs() -> u64 { + std::time::SystemTime::now() + .duration_since(std::time::UNIX_EPOCH) + .unwrap_or_default() + .as_secs() +} + pub(crate) fn init_tracing() { let (writer, guard) = tracing_appender::non_blocking(std::io::stderr()); let subscriber = 
tracing_subscriber::fmt::Subscriber::builder() diff --git a/crates/broker/src/runtime/worker_events.rs b/crates/broker/src/runtime/worker_events.rs index 17c5729e0..1f4ff3fea 100644 --- a/crates/broker/src/runtime/worker_events.rs +++ b/crates/broker/src/runtime/worker_events.rs @@ -6,12 +6,10 @@ impl BrokerRuntime { let state = &mut self.state; let sdk_out_tx = &self.sdk_out_tx; let ws_control_tx = &self.ws_control_tx; - let relaycast_http = &self.relaycast_http; let workers = &mut self.workers; let pending_deliveries = &mut self.pending_deliveries; let terminal_failed_deliveries = &mut self.terminal_failed_deliveries; let pending_requests = &mut self.pending_requests; - let delivery_states = &mut self.delivery_states; let delivery_retry_interval = self.delivery_retry_interval; match worker_event { @@ -492,9 +490,6 @@ impl BrokerRuntime { } } } else if msg_type == "worker_exited" { - // PTY worker process is exiting — clean up and - // emit agent_exited so the SDK doesn't have to - // wait for the reap_exited polling cycle. let code = value .get("payload") .and_then(|p| p.get("code")) @@ -509,62 +504,8 @@ impl BrokerRuntime { agent = %name, code = ?code, signal = ?signal, - "worker_exited received — cleaning up" + "worker_exited received; deferring cleanup to reap_exited" ); - // Remove from registry so reap_exited won't - // double-process this worker. 
- workers.workers.remove(&name); - workers.initial_tasks.remove(&name); - // Drop pending deliveries for this worker - let dropped = drop_pending_for_worker(pending_deliveries, &name); - if dropped > 0 { - let _ = send_event( - sdk_out_tx, - json!({ - "kind": "delivery_dropped", - "name": name, - "count": dropped, - "reason": "worker_exited", - }), - ) - .await; - } - fail_pending_requests_for_worker(pending_requests, &name, "worker_exited"); - delivery_states.remove(&name); - let _ = send_event( - sdk_out_tx, - json!({ - "kind": "agent_exited", - "name": name, - "code": code, - "signal": signal, - }), - ) - .await; - publish_agent_state_transition( - ws_control_tx, - &name, - "exited", - Some("worker_exited"), - ) - .await; - if let Err(error) = relaycast_http.mark_agent_offline(&name).await { - tracing::warn!( - worker = %name, - error = %error, - "failed to mark exited worker offline in relaycast" - ); - } - state.agents.remove(&name); - if paths.persist { - if let Err(error) = state.save(&paths.state) { - tracing::warn!( - path = %paths.state.display(), - error = %error, - "failed to persist broker state" - ); - } - } } } } diff --git a/crates/broker/src/worker_request.rs b/crates/broker/src/worker_request.rs index e866653e8..4b626811f 100644 --- a/crates/broker/src/worker_request.rs +++ b/crates/broker/src/worker_request.rs @@ -190,9 +190,9 @@ pub(crate) fn reap_expired( /// Fail every pending request targeting `worker_name` immediately with /// [`RequestWorkerError::WorkerDisappeared`]. Called from the broker's -/// worker-teardown paths (explicit release, `worker_exited` frame, -/// `reap_exited` sweep) so that in-flight HTTP callers don't have to -/// wait out the full request deadline when a worker has clearly gone. +/// worker-teardown paths (explicit release or `reap_exited` sweep) so +/// that in-flight HTTP callers don't have to wait out the full request +/// deadline when a worker has clearly gone. 
/// /// Returns the `(request_id, kind)` pairs that were drained, for the /// caller to emit structured logs. From b788359294d57d5a31834146ea6340055ce3ef0c Mon Sep 17 00:00:00 2001 From: Will Washburn Date: Mon, 18 May 2026 22:19:55 -0400 Subject: [PATCH 8/8] fix: pass idle threshold to spawned workers --- crates/broker/src/listen_api.rs | 4 +++- crates/broker/src/runtime/api.rs | 15 ++++++++------- 2 files changed, 11 insertions(+), 8 deletions(-) diff --git a/crates/broker/src/listen_api.rs b/crates/broker/src/listen_api.rs index d14b816d6..d9dc85c5e 100644 --- a/crates/broker/src/listen_api.rs +++ b/crates/broker/src/listen_api.rs @@ -2037,7 +2037,7 @@ mod auth_tests { shadow_of, shadow_mode, continue_from, - idle_threshold_secs: _, + idle_threshold_secs, skip_relay_prompt: _, restart_policy: _, agent_token: _, @@ -2058,6 +2058,7 @@ mod auth_tests { assert_eq!(shadow_of.as_deref(), Some("Lead")); assert_eq!(shadow_mode.as_deref(), Some("subagent")); assert_eq!(continue_from.as_deref(), Some("worker-prev")); + assert_eq!(idle_threshold_secs, Some(30)); let _ = reply.send(Ok( json!({ "success": true, "name": "worker-a", "pid": 42 }), )); @@ -2087,6 +2088,7 @@ mod auth_tests { "shadowOf": "Lead", "shadowMode": "subagent", "continueFrom": "worker-prev", + "idleThresholdSecs": 30, }) .to_string(), )) diff --git a/crates/broker/src/runtime/api.rs b/crates/broker/src/runtime/api.rs index e8622d73b..84bfb927a 100644 --- a/crates/broker/src/runtime/api.rs +++ b/crates/broker/src/runtime/api.rs @@ -185,14 +185,19 @@ impl BrokerRuntime { } } + let spawn_workspace_id = default_workspace_id.clone().or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.clone()) + }); match workers .spawn( spec, Some("Dashboard".to_string()), - None, + idle_threshold_secs, worker_relay_key.clone(), skip_relay_prompt, - idle_threshold_secs.map(|s| s.to_string()), + spawn_workspace_id.clone(), ) .await { @@ -235,11 +240,7 @@ impl BrokerRuntime { } 
note_local_spawn_control_dedup( dedup, - default_workspace_id.as_deref().or_else(|| { - workspaces - .first() - .map(|workspace| workspace.workspace_id.as_str()) - }), + spawn_workspace_id.as_deref(), &name, worker_relay_key.as_deref(), );