From 1c65117daaa1d2bbfd4f1a5d6c31b2b8f849f4bf Mon Sep 17 00:00:00 2001 From: Claude Date: Sat, 10 Jan 2026 16:25:19 +0000 Subject: [PATCH] feat: add workspace keepalive service to prevent Fly.io idle Implements cloud-to-workspace heartbeat mechanism to prevent Fly.io from idling machines that have active agents running. Problem: Fly.io uses request-based concurrency tracking. When Claude is running but no user has the dashboard open, there are no inbound HTTP requests, so Fly may idle the machine. Solution: - Add /keep-alive endpoint on workspace dashboard server - Create WorkspaceKeepaliveService that pings workspaces every 60s - Only pings workspaces that have online daemons with active agents - Inbound ping counts as activity for Fly's idle detection The service: - Queries all running workspaces with public URLs - Checks each workspace's daemons for online status and active agents - Pings /keep-alive endpoint which responds with active agent count - Stops pinging when no agents are active (allowing normal idle) --- src/cloud/server.ts | 18 +- src/cloud/services/index.ts | 9 + src/cloud/services/workspace-keepalive.ts | 297 ++++++++++++++++++++++ src/dashboard-server/server.ts | 30 +++ 4 files changed, 353 insertions(+), 1 deletion(-) create mode 100644 src/cloud/services/workspace-keepalive.ts diff --git a/src/cloud/server.ts b/src/cloud/server.ts index 745aa81de..0a69371a8 100644 --- a/src/cloud/server.ts +++ b/src/cloud/server.ts @@ -16,7 +16,7 @@ import { RedisStore } from 'connect-redis'; import { WebSocketServer, WebSocket } from 'ws'; import { getConfig } from './config.js'; import { runMigrations } from './db/index.js'; -import { getScalingOrchestrator, ScalingOrchestrator, getComputeEnforcementService, ComputeEnforcementService, getIntroExpirationService, IntroExpirationService } from './services/index.js'; +import { getScalingOrchestrator, ScalingOrchestrator, getComputeEnforcementService, ComputeEnforcementService, getIntroExpirationService, 
IntroExpirationService, getWorkspaceKeepaliveService, WorkspaceKeepaliveService } from './services/index.js'; const __filename = fileURLToPath(import.meta.url); const __dirname = path.dirname(__filename); @@ -382,6 +382,7 @@ export async function createServer(): Promise { let scalingOrchestrator: ScalingOrchestrator | null = null; let computeEnforcement: ComputeEnforcementService | null = null; let introExpiration: IntroExpirationService | null = null; + let workspaceKeepalive: WorkspaceKeepaliveService | null = null; let daemonStaleCheckInterval: ReturnType | null = null; // Create HTTP server for WebSocket upgrade handling @@ -758,6 +759,16 @@ export async function createServer(): Promise { } catch (error) { console.warn('[cloud] Failed to start intro expiration:', error); } + + // Start workspace keepalive service (pings workspaces with active agents) + // This prevents Fly.io from idling machines that have running Claude agents + try { + workspaceKeepalive = getWorkspaceKeepaliveService(); + workspaceKeepalive.start(); + console.log('[cloud] Workspace keepalive service started'); + } catch (error) { + console.warn('[cloud] Failed to start workspace keepalive:', error); + } } // Start daemon stale check (mark daemons offline if no heartbeat for 2+ minutes) @@ -800,6 +811,11 @@ export async function createServer(): Promise { introExpiration.stop(); } + // Stop workspace keepalive service + if (workspaceKeepalive) { + workspaceKeepalive.stop(); + } + // Stop daemon stale check if (daemonStaleCheckInterval) { clearInterval(daemonStaleCheckInterval); diff --git a/src/cloud/services/index.ts b/src/cloud/services/index.ts index 76f85fd7d..386750d9e 100644 --- a/src/cloud/services/index.ts +++ b/src/cloud/services/index.ts @@ -87,3 +87,12 @@ export { startIntroExpirationService, stopIntroExpirationService, } from './intro-expiration.js'; + +// Workspace keepalive (prevent Fly.io from idling machines with active agents) +export { + WorkspaceKeepaliveService, + 
WorkspaceKeepaliveConfig,
+  KeepaliveStats,
+  getWorkspaceKeepaliveService,
+  createWorkspaceKeepaliveService,
+} from './workspace-keepalive.js';
diff --git a/src/cloud/services/workspace-keepalive.ts b/src/cloud/services/workspace-keepalive.ts
new file mode 100644
index 000000000..4d71662a8
--- /dev/null
+++ b/src/cloud/services/workspace-keepalive.ts
@@ -0,0 +1,361 @@
+/**
+ * Workspace Keepalive Service
+ *
+ * Prevents Fly.io from idling workspace machines that have active agents running.
+ *
+ * Problem: Fly.io uses request-based concurrency tracking to determine when to
+ * idle a machine. If a Claude agent is running but no HTTP requests are coming
+ * in (e.g., no one has the dashboard open), Fly.io may idle the machine.
+ *
+ * Solution: The cloud server periodically pings workspace machines that have
+ * active agents. This inbound HTTP request counts as activity for Fly.io's
+ * idle detection, keeping the machine awake.
+ *
+ * Flow:
+ * 1. Daemons report their running agents via heartbeat
+ * 2. This service queries for workspaces with active agents
+ * 3. Pings each workspace's /keep-alive endpoint
+ * 4. Workspace stays awake as long as agents are active
+ */
+
+import { EventEmitter } from 'events';
+import { db } from '../db/index.js';
+
+/** Tunables for the keepalive loop; omitted fields fall back to DEFAULT_CONFIG. */
+export interface WorkspaceKeepaliveConfig {
+  /** How often to ping active workspaces (default: 60s) */
+  pingIntervalMs: number;
+  /** Request timeout for keep-alive pings (default: 5s) */
+  requestTimeoutMs: number;
+  /** Consider daemon stale if last heartbeat older than this (default: 2 min) */
+  staleThresholdMs: number;
+  /** Enable verbose logging (default: false) */
+  verbose: boolean;
+}
+
+/** Counters returned by WorkspaceKeepaliveService.getStats(). */
+export interface KeepaliveStats {
+  /** Time of the latest cycle whose workspace lookup succeeded; null before that. */
+  lastRun: Date | null;
+  /** Pings attempted since the service was constructed. */
+  totalPings: number;
+  /** Pings that received a 2xx response. */
+  successfulPings: number;
+  /** Pings that got a non-2xx response, timed out, or failed with another fetch error. */
+  failedPings: number;
+  /** Workspaces with active agents found by the latest cycle. */
+  activeWorkspaces: number;
+}
+
+/** A running workspace together with the first daemon found reporting active agents. */
+interface WorkspaceWithAgents {
+  workspaceId: string;
+  publicUrl: string;
+  daemonId: string;
+  daemonName: string;
+  agentCount: number;
+}
+
+const DEFAULT_CONFIG: WorkspaceKeepaliveConfig = {
+  pingIntervalMs: 60_000, // 1 minute (well under Fly's ~5-10 min idle timeout)
+  requestTimeoutMs: 5_000, // 5 seconds
+  staleThresholdMs: 2 * 60 * 1000, // 2 minutes
+  verbose: false,
+};
+
+/** Agent statuses that count as active (anything else is treated as offline). */
+const ACTIVE_AGENT_STATUSES: ReadonlySet<string> = new Set(['online', 'running', 'active']);
+
+/**
+ * True when `agent` is an object whose `status` is one of ACTIVE_AGENT_STATUSES.
+ * Daemon metadata reaches this module through a cast, so the shape is checked here.
+ */
+function isActiveAgent(agent: unknown): boolean {
+  if (typeof agent !== 'object' || agent === null) {
+    return false;
+  }
+  const status = (agent as { status?: unknown }).status;
+  return typeof status === 'string' && ACTIVE_AGENT_STATUSES.has(status);
+}
+
+/**
+ * Periodically pings the /keep-alive endpoint of every running workspace that
+ * has an online daemon reporting active agents.
+ *
+ * Events:
+ * - 'ping-cycle': { workspaces, duration, results } after a cycle that pinged at least one workspace
+ * - 'error': the thrown value when a cycle fails (only emitted if a listener is attached)
+ */
+export class WorkspaceKeepaliveService extends EventEmitter {
+  private readonly config: WorkspaceKeepaliveConfig;
+  private pingTimer: ReturnType<typeof setInterval> | null = null;
+  private stats: KeepaliveStats = {
+    lastRun: null,
+    totalPings: 0,
+    successfulPings: 0,
+    failedPings: 0,
+    activeWorkspaces: 0,
+  };
+
+  /**
+   * @param config Overrides for DEFAULT_CONFIG; keys set to `undefined` are ignored.
+   */
+  constructor(config: Partial<WorkspaceKeepaliveConfig> = {}) {
+    super();
+    // A plain spread would let `{ pingIntervalMs: undefined }` overwrite the
+    // default and hand `undefined` to setInterval, so drop undefined keys first.
+    const overrides = Object.fromEntries(
+      Object.entries(config).filter(([, value]) => value !== undefined)
+    ) as Partial<WorkspaceKeepaliveConfig>;
+    this.config = { ...DEFAULT_CONFIG, ...overrides };
+  }
+
+  /**
+   * Start the keepalive service: run one cycle immediately, then one every
+   * `pingIntervalMs`. Calling start() while already running is a no-op.
+   */
+  start(): void {
+    if (this.pingTimer) {
+      return; // Already running
+    }
+
+    console.log('[keepalive] Starting workspace keepalive service', {
+      intervalMs: this.config.pingIntervalMs,
+    });
+
+    // Initial ping
+    this.pingActiveWorkspaces().catch((err) => {
+      console.error('[keepalive] Initial ping failed:', err);
+    });
+
+    // Start periodic pings
+    this.pingTimer = setInterval(() => {
+      this.pingActiveWorkspaces().catch((err) => {
+        console.error('[keepalive] Periodic ping failed:', err);
+      });
+    }, this.config.pingIntervalMs);
+  }
+
+  /**
+   * Stop the keepalive service. A cycle that is already in flight is not cancelled.
+   */
+  stop(): void {
+    if (this.pingTimer) {
+      clearInterval(this.pingTimer);
+      this.pingTimer = null;
+      console.log('[keepalive] Stopped workspace keepalive service');
+    }
+  }
+
+  /**
+   * Get a snapshot copy of the current statistics
+   */
+  getStats(): KeepaliveStats {
+    return { ...this.stats };
+  }
+
+  /**
+   * Run one cycle: find workspaces with active agents and ping them in parallel.
+   *
+   * Failures are logged rather than thrown. An 'error' event is emitted only when
+   * a listener is attached, because EventEmitter throws on an unhandled 'error'.
+   */
+  async pingActiveWorkspaces(): Promise<void> {
+    const startTime = Date.now();
+
+    try {
+      // Find workspaces with active agents
+      const activeWorkspaces = await this.findWorkspacesWithActiveAgents();
+      this.stats.activeWorkspaces = activeWorkspaces.length;
+      this.stats.lastRun = new Date();
+
+      if (activeWorkspaces.length === 0) {
+        if (this.config.verbose) {
+          console.log('[keepalive] No active workspaces to ping');
+        }
+        return;
+      }
+
+      if (this.config.verbose) {
+        console.log(`[keepalive] Pinging ${activeWorkspaces.length} active workspace(s)`);
+      }
+
+      // Ping each workspace in parallel
+      const results = await Promise.allSettled(
+        activeWorkspaces.map((ws) => this.pingWorkspace(ws))
+      );
+      const outcomes = results.map((r) => r.status === 'fulfilled' && r.value);
+
+      // Update stats
+      for (const succeeded of outcomes) {
+        this.stats.totalPings++;
+        if (succeeded) {
+          this.stats.successfulPings++;
+        } else {
+          this.stats.failedPings++;
+        }
+      }
+
+      const duration = Date.now() - startTime;
+      if (this.config.verbose) {
+        console.log(`[keepalive] Ping cycle complete`, {
+          workspaces: activeWorkspaces.length,
+          durationMs: duration,
+        });
+      }
+
+      this.emit('ping-cycle', {
+        workspaces: activeWorkspaces.length,
+        duration,
+        results: outcomes,
+      });
+    } catch (err) {
+      console.error('[keepalive] Error in ping cycle:', err);
+      // emit('error') with no listener rethrows err, which would turn an
+      // already-logged failure into a rejected promise for the caller.
+      if (this.listenerCount('error') > 0) {
+        this.emit('error', err);
+      }
+    }
+  }
+
+  /**
+   * Find all workspaces that have daemons with active agents.
+   *
+   * Returns at most one entry per workspace (the first qualifying daemon).
+   */
+  private async findWorkspacesWithActiveAgents(): Promise<WorkspaceWithAgents[]> {
+    const staleThreshold = new Date(Date.now() - this.config.staleThresholdMs);
+
+    // Get all workspaces and check each for active agents
+    const allWorkspaces = await db.workspaces.findAll();
+
+    const activeWorkspaces: WorkspaceWithAgents[] = [];
+
+    for (const workspace of allWorkspaces) {
+      // Skip workspaces that aren't running or don't have a URL
+      if (workspace.status !== 'running' || !workspace.publicUrl) {
+        continue;
+      }
+
+      // Get daemons for this workspace
+      const daemons = await db.linkedDaemons.findByWorkspaceId(workspace.id);
+
+      for (const daemon of daemons) {
+        // Skip offline daemons or those with stale heartbeats.
+        // NOTE(review): a daemon with no lastSeenAt at all is treated as fresh - confirm.
+        if (daemon.status !== 'online') continue;
+        if (daemon.lastSeenAt && daemon.lastSeenAt < staleThreshold) continue;
+
+        // metadata.agents may be missing or malformed; a bad record must not
+        // abort the whole cycle, so anything that is not an array counts as empty.
+        const metadata = daemon.metadata as Record<string, unknown> | null;
+        const reportedAgents = metadata?.agents;
+        const agentCount = Array.isArray(reportedAgents)
+          ? reportedAgents.filter(isActiveAgent).length
+          : 0;
+
+        if (agentCount > 0) {
+          activeWorkspaces.push({
+            workspaceId: workspace.id,
+            publicUrl: workspace.publicUrl,
+            daemonId: daemon.id,
+            daemonName: daemon.name,
+            agentCount,
+          });
+          // Only need one daemon per workspace to keep it alive
+          break;
+        }
+      }
+    }
+
+    return activeWorkspaces;
+  }
+
+  /**
+   * Ping a single workspace's keep-alive endpoint.
+   *
+   * @returns true for a 2xx response; false for any other status, a timeout, or a network error.
+   */
+  private async pingWorkspace(workspace: WorkspaceWithAgents): Promise<boolean> {
+    const url = `${workspace.publicUrl.replace(/\/$/, '')}/keep-alive`;
+    const controller = new AbortController();
+    const timeout = setTimeout(() => controller.abort(), this.config.requestTimeoutMs);
+
+    try {
+      const response = await fetch(url, {
+        method: 'GET',
+        signal: controller.signal,
+        headers: {
+          'User-Agent': 'AgentRelay-Keepalive/1.0',
+        },
+      });
+
+      if (!response.ok) {
+        console.warn(`[keepalive] Ping failed for ${workspace.daemonName}:`, {
+          status: response.status,
+          url,
+        });
+        return false;
+      }
+
+      // The 2xx status already proves the request reached the machine; the body
+      // is only used for logging, so an unreadable body does not fail the ping.
+      let activeAgents: number | undefined;
+      try {
+        const data = (await response.json()) as { ok: boolean; activeAgents?: number } | null;
+        activeAgents = data?.activeAgents;
+      } catch {
+        // Leave activeAgents undefined
+      }
+
+      if (this.config.verbose) {
+        console.log(`[keepalive] Pinged ${workspace.daemonName}`, {
+          workspaceId: workspace.workspaceId,
+          activeAgents,
+        });
+      }
+      return true;
+    } catch (err) {
+      // Don't log aborted requests as errors (timeout is expected for stopped machines)
+      if (err instanceof Error && err.name === 'AbortError') {
+        if (this.config.verbose) {
+          console.log(`[keepalive] Ping timeout for ${workspace.daemonName} (machine may be starting)`);
+        }
+      } else {
+        console.warn(`[keepalive] Ping error for ${workspace.daemonName}:`, err);
+      }
+      return false;
+    } finally {
+      // Always release the abort timer, including when fetch itself throws.
+      clearTimeout(timeout);
+    }
+  }
+}
+
+// Singleton instance
+let _keepaliveService: WorkspaceKeepaliveService | null = null;
+
+/**
+ * Get or create the keepalive service singleton.
+ *
+ * `config` is only used by the call that creates the instance; later calls
+ * return the existing service and ignore it.
+ */
+export function getWorkspaceKeepaliveService(
+  config?: Partial<WorkspaceKeepaliveConfig>
+): WorkspaceKeepaliveService {
+  if (!_keepaliveService) {
+    _keepaliveService = new WorkspaceKeepaliveService(config);
+  }
+  return _keepaliveService;
+}
+
+/**
+ * Create a new keepalive service (for testing)
+ */
+export function createWorkspaceKeepaliveService(
+  config?: Partial<WorkspaceKeepaliveConfig>
+): WorkspaceKeepaliveService {
+  return new WorkspaceKeepaliveService(config);
+}
diff --git a/src/dashboard-server/server.ts b/src/dashboard-server/server.ts
index 1a9a13064..7c90c784f 100644
--- a/src/dashboard-server/server.ts
+++ b/src/dashboard-server/server.ts
@@ -2400,6 +2400,36 @@ export async function startDashboard(
     });
   });
 
+  /**
+   * GET /keep-alive - Keep-alive endpoint for Fly.io idle prevention
+   * Called by cloud server when workspace has active agents running.
+   * This inbound request counts as activity for Fly.io's request-based
+   * concurrency tracking, preventing the machine from being idled.
+   */
+  app.get('/keep-alive', (req, res) => {
+    // An agent counts as online when its lastSeen timestamp in
+    // <teamDir>/agents.json is less than 30 seconds old
+    const agentsPath = path.join(teamDir, 'agents.json');
+    const countOnlineAgents = (): number => {
+      if (!fs.existsSync(agentsPath)) return 0;
+      try {
+        const parsed = JSON.parse(fs.readFileSync(agentsPath, 'utf-8'));
+        const cutoff = Date.now() - 30 * 1000;
+        const roster: Array<{ lastSeen?: string }> = parsed.agents || [];
+        return roster.filter((a) => !!a.lastSeen && new Date(a.lastSeen).getTime() > cutoff).length;
+      } catch {
+        // Unreadable or malformed agents.json: report zero agents
+        return 0;
+      }
+    };
+
+    res.json({
+      ok: true,
+      activeAgents: countOnlineAgents(),
+      timestamp: Date.now(),
+    });
+  });
+
   // ===== CLI Auth API (for workspace-based provider authentication) =====
 
   /**