From 506f930927474363c8098f6de041c1a73cf2a9d7 Mon Sep 17 00:00:00 2001 From: Khaliq Date: Fri, 20 Mar 2026 12:22:51 +0100 Subject: [PATCH 1/5] fix: reduce WS spawn pre-registration timeout from 15s to 3s PR #591 added a synchronous register_agent_token() HTTP call with a 15s timeout in the WS event loop before spawning agents. This blocked the event loop and delayed Codex agent spawns by up to 15s (on top of the existing 25s boot marker timeout), causing apparent spawn failures. Reduce the timeout to 3s so the spawn proceeds quickly. On timeout or failure, the agent self-registers via its MCP server (pre-#591 behavior). Also adds ~/.local/bin, ~/.opencode/bin, ~/.claude/local to the fallback PATH in pty.rs so CLIs installed in user-local directories are found. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/main.rs | 43 +++++++++++++++++++++++-------------------- src/pty.rs | 5 ++++- 2 files changed, 27 insertions(+), 21 deletions(-) diff --git a/src/main.rs b/src/main.rs index 235650f41..1f19219ca 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2552,16 +2552,17 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let spec_for_state = spec.clone(); let effective_task = normalize_initial_task(task.clone()); - // Pre-register the agent so its MCP server starts - // with a valid token (same as the SDK spawn_agent path). - // The WS event may include a token, but often doesn't — - // fall back to broker-side registration. + // Try to get the token from the WS event first (instant). + // If unavailable, attempt a quick registration with a short + // timeout. Previously this used a 15s timeout which blocked + // the WS event loop and delayed agent spawn significantly. + // On failure, the agent will self-register via its MCP server. let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); if ws_token.is_some() { ws_token } else { - const REG_TIMEOUT: Duration = Duration::from_secs(15); + const REG_TIMEOUT: Duration = Duration::from_secs(3); match tokio::time::timeout( REG_TIMEOUT, workspace_http.register_agent_token(&name, Some(cli.as_str())), @@ -2584,7 +2585,7 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { Err(_) => { tracing::warn!( worker = %name, - "WS spawn pre-registration timed out; agent will self-register" + "WS spawn pre-registration timed out (3s); agent will self-register" ); None } @@ -2739,13 +2740,13 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let task_opt = Some(task).filter(|v| !v.trim().is_empty()); let effective_task = normalize_initial_task(task_opt.clone()); - // Pre-register (same as primary WS spawn path above). + // Pre-register with short timeout (same as primary WS spawn path). let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); if ws_token.is_some() { ws_token } else { - const REG_TIMEOUT: Duration = Duration::from_secs(15); + const REG_TIMEOUT: Duration = Duration::from_secs(3); match tokio::time::timeout( REG_TIMEOUT, workspace_http.register_agent_token(&name, Some(cli.as_str())), @@ -2760,7 +2761,7 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { None } Err(_) => { - tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out"); + tracing::warn!(worker = %name, "WS spawn fallback pre-registration timed out (3s)"); None } } @@ -4142,6 +4143,19 @@ async fn handle_sdk_frame( None }; + // Seed the dedup cache BEFORE spawning so that a Relaycast WS echo + // arriving while the spawn is in progress is correctly deduplicated. + note_local_spawn_control_dedup( + dedup, + default_workspace_id.or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.as_str()) + }), + &name, + worker_relay_key.as_deref(), + ); + workers .spawn( payload.agent.clone(), @@ -4175,17 +4189,6 @@ async fn handle_sdk_frame( .await; } } - - note_local_spawn_control_dedup( - dedup, - default_workspace_id.or_else(|| { - workspaces - .first() - .map(|workspace| workspace.workspace_id.as_str()) - }), - &name, - worker_relay_key.as_deref(), - ); if let Some(task) = effective_task.clone() { workers.initial_tasks.insert(name.clone(), task); } diff --git a/src/pty.rs b/src/pty.rs index 9864ef23b..3b4d90294 100644 --- a/src/pty.rs +++ b/src/pty.rs @@ -55,7 +55,10 @@ fn resolve_command_path(command: &str) -> String { .unwrap_or_else(|| { #[cfg(unix)] { - OsString::from("/usr/local/bin:/usr/bin:/bin:/opt/homebrew/bin") + let home = env::var("HOME").unwrap_or_else(|_| String::from("/root")); + OsString::from(format!( + "{home}/.local/bin:{home}/.opencode/bin:{home}/.claude/local:/usr/local/bin:/usr/bin:/bin:/opt/homebrew/bin" + )) } #[cfg(windows)] { From 681e51cf79fa677abeb50c6f73d97f3fb0f8a0b9 Mon Sep 17 00:00:00 2001 From: Khaliq Date: Fri, 20 Mar 2026 12:29:06 +0100 Subject: [PATCH 2/5] fix: skip pre-registration for Claude agents (self-registers via MCP) Claude bakes the API key into --mcp-config JSON and self-registers reliably, so the blocking HTTP registration call is unnecessary. Non-Claude CLIs still get a 3s registration attempt since they need the token injected into their CLI args at spawn time. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/main.rs | 20 ++++++++++++++------ 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/src/main.rs b/src/main.rs index 1f19219ca..8792aec01 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2552,15 +2552,19 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let spec_for_state = spec.clone(); let effective_task = normalize_initial_task(task.clone()); - // Try to get the token from the WS event first (instant). - // If unavailable, attempt a quick registration with a short - // timeout. Previously this used a 15s timeout which blocked - // the WS event loop and delayed agent spawn significantly. - // On failure, the agent will self-register via its MCP server. + // Pre-register agent token. Claude doesn't need this — it + // bakes the API key into --mcp-config JSON and self-registers. + // Non-Claude CLIs need the token injected into their CLI args + // at spawn time, so we do a quick (3s) registration attempt. + let cli_name_lower = normalize_cli_name(&cli).to_lowercase(); + let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); if ws_token.is_some() { ws_token + } else if is_claude { + // Claude self-registers via its MCP server — skip blocking call + None } else { const REG_TIMEOUT: Duration = Duration::from_secs(3); match tokio::time::timeout( @@ -2740,11 +2744,15 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let task_opt = Some(task).filter(|v| !v.trim().is_empty()); let effective_task = normalize_initial_task(task_opt.clone()); - // Pre-register with short timeout (same as primary WS spawn path). + // Pre-register (same logic as primary WS spawn path). + let cli_name_lower = normalize_cli_name(&cli).to_lowercase(); + let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); if ws_token.is_some() { ws_token + } else if is_claude { + None } else { const REG_TIMEOUT: Duration = Duration::from_secs(3); match tokio::time::timeout( From 8e443b9f524ad5cad986fd4b778377484f2fbecc Mon Sep 17 00:00:00 2001 From: Khaliq Date: Fri, 20 Mar 2026 12:48:04 +0100 Subject: [PATCH 3/5] =?UTF-8?q?fix:=20address=20Devin=20review=20=E2=80=94?= =?UTF-8?q?=20CLI=20arg=20parsing=20and=20dedup-after-spawn?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Co-Authored-By: Claude Opus 4.6 (1M context) --- src/main.rs | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/src/main.rs b/src/main.rs index 8792aec01..8fb1bad49 100644 --- a/src/main.rs +++ b/src/main.rs @@ -2556,7 +2556,8 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { // bakes the API key into --mcp-config JSON and self-registers. // Non-Claude CLIs need the token injected into their CLI args // at spawn time, so we do a quick (3s) registration attempt. - let cli_name_lower = normalize_cli_name(&cli).to_lowercase(); + let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); @@ -2745,7 +2746,8 @@ async fn run_init(cmd: InitCommand, telemetry: TelemetryClient) -> Result<()> { let effective_task = normalize_initial_task(task_opt.clone()); // Pre-register (same logic as primary WS spawn path). - let cli_name_lower = normalize_cli_name(&cli).to_lowercase(); + let cli_command = parse_cli_command(&cli).map(|(cmd, _)| cmd).unwrap_or_else(|_| cli.clone()); + let cli_name_lower = normalize_cli_name(&cli_command).to_lowercase(); let is_claude = cli_name_lower == "claude" || cli_name_lower.starts_with("claude:"); let worker_relay_key = { let ws_token = relaycast_ws_spawn_token(&ws_value); @@ -4151,19 +4153,6 @@ async fn handle_sdk_frame( None }; - // Seed the dedup cache BEFORE spawning so that a Relaycast WS echo - // arriving while the spawn is in progress is correctly deduplicated. - note_local_spawn_control_dedup( - dedup, - default_workspace_id.or_else(|| { - workspaces - .first() - .map(|workspace| workspace.workspace_id.as_str()) - }), - &name, - worker_relay_key.as_deref(), - ); - workers .spawn( payload.agent.clone(), @@ -4175,6 +4164,19 @@ async fn handle_sdk_frame( ) .await?; + // Seed the dedup cache AFTER successful spawn so that a failed + // spawn does not block retries for the 5-minute dedup window. + note_local_spawn_control_dedup( + dedup, + default_workspace_id.or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.as_str()) + }), + &name, + worker_relay_key.as_deref(), + ); + // Subscribe the broker's WebSocket to any custom channels the // spawned agent needs so cloud-routed messages reach the broker. if !payload.agent.channels.is_empty() { From d4f1639396e55cd7fe850e4cbb6f04797aa048fa Mon Sep 17 00:00:00 2001 From: Khaliq Date: Fri, 20 Mar 2026 12:50:12 +0100 Subject: [PATCH 4/5] fix: seed dedup before spawn with cleanup on failure Issue 1: Keep dedup seeding before spawn (so WS echoes during spawn are deduplicated) but remove the dedup entry if spawn fails, preventing failed spawns from blocking retries for the 5-minute dedup window. Adds DedupCache::remove() and remove_local_spawn_control_dedup(). Issue 2: Already fixed in prior commit (parse_cli_command before normalize_cli_name for is_claude check). Co-Authored-By: Claude Opus 4.6 (1M context) --- src/dedup.rs | 5 +++++ src/main.rs | 62 +++++++++++++++++++++++++++++++++++++++++----------- 2 files changed, 54 insertions(+), 13 deletions(-) diff --git a/src/dedup.rs b/src/dedup.rs index d5f729d66..d1f1a1e61 100644 --- a/src/dedup.rs +++ b/src/dedup.rs @@ -54,6 +54,11 @@ impl DedupCache { } } + pub fn remove(&mut self, id: &str) { + self.seen.remove(id); + self.order.retain(|(key, _)| key != id); + } + pub fn len(&self) -> usize { self.seen.len() } diff --git a/src/main.rs b/src/main.rs index 8fb1bad49..7106a9099 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4153,19 +4153,9 @@ async fn handle_sdk_frame( None }; - workers - .spawn( - payload.agent.clone(), - None, - payload.idle_threshold_secs, - worker_relay_key.clone(), - payload.skip_relay_prompt, - None, - ) - .await?; - - // Seed the dedup cache AFTER successful spawn so that a failed - // spawn does not block retries for the 5-minute dedup window. + // Seed the dedup cache BEFORE spawning so that a Relaycast WS echo + // arriving while the spawn is in progress is correctly deduplicated. + // If spawn fails we remove the entry so retries are not blocked. note_local_spawn_control_dedup( dedup, default_workspace_id.or_else(|| { @@ -4177,6 +4167,32 @@ async fn handle_sdk_frame( worker_relay_key.as_deref(), ); + if let Err(err) = workers + .spawn( + payload.agent.clone(), + None, + payload.idle_threshold_secs, + worker_relay_key.clone(), + payload.skip_relay_prompt, + None, + ) + .await + { + // Spawn failed — remove the dedup entry so WS retries are not + // blocked for the 5-minute dedup window. + remove_local_spawn_control_dedup( + dedup, + default_workspace_id.or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.as_str()) + }), + &name, + worker_relay_key.as_deref(), + ); + return Err(err); + } + // Subscribe the broker's WebSocket to any custom channels the // spawned agent needs so cloud-routed messages reach the broker. if !payload.agent.channels.is_empty() { @@ -5670,6 +5686,26 @@ fn note_local_spawn_control_dedup( } } +fn remove_local_spawn_control_dedup( + dedup: &mut DedupCache, + workspace_id: Option<&str>, + agent_name: &str, + relay_key: Option<&str>, +) { + let Some(workspace_id) = workspace_id else { + return; + }; + let agent_name = agent_name.trim(); + if !agent_name.is_empty() { + let key = relaycast_spawn_control_dedup_key(workspace_id, agent_name); + dedup.remove(&key); + } + if let Some(relay_key) = relay_key.map(str::trim).filter(|value| !value.is_empty()) { + let key = relaycast_spawn_control_dedup_key(workspace_id, relay_key); + dedup.remove(&key); + } +} + fn is_unknown_worker_error_message(message: &str) -> bool { message.contains("unknown worker '") } From 5f78fa8feda4add9b1f978a3b9e5e77883ae47b5 Mon Sep 17 00:00:00 2001 From: Khaliq Date: Fri, 20 Mar 2026 13:00:09 +0100 Subject: [PATCH 5/5] fix: preserve dedup entries when spawn fails with already-exists When a second spawn request for an already-running agent fails with "already exists", we must not remove the dedup entry from the first successful spawn. Doing so would allow WebSocket echoes through. Co-Authored-By: Claude Opus 4.6 (1M context) --- src/main.rs | 29 +++++++++++++++++------------ 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/src/main.rs b/src/main.rs index 7106a9099..a8c993f07 100644 --- a/src/main.rs +++ b/src/main.rs @@ -4178,18 +4178,23 @@ async fn handle_sdk_frame( ) .await { - // Spawn failed — remove the dedup entry so WS retries are not - // blocked for the 5-minute dedup window. - remove_local_spawn_control_dedup( - dedup, - default_workspace_id.or_else(|| { - workspaces - .first() - .map(|workspace| workspace.workspace_id.as_str()) - }), - &name, - worker_relay_key.as_deref(), - ); + let err_msg = format!("{err:#}"); + // Only clean up dedup if this was a genuinely new spawn attempt + // that failed, not a duplicate request for an already-running + // agent. When the error is "already exists" the dedup entry + // belongs to the prior successful spawn and must be preserved. + if !err_msg.contains("already exists") { + remove_local_spawn_control_dedup( + dedup, + default_workspace_id.or_else(|| { + workspaces + .first() + .map(|workspace| workspace.workspace_id.as_str()) + }), + &name, + worker_relay_key.as_deref(), + ); + } return Err(err); }