From e1e622144a21abb760cbf352ccaa607af4742cb8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 07:47:01 +0000
Subject: [PATCH 01/19] Add federation proposal for cross-server agent
 communication

Comprehensive design document for extending agent-relay to support
federated multi-server deployments while preserving the core
differentiator: automatic message injection via tmux.

Key design decisions:
- Separation of concerns: routing (network) vs injection (local)
- Hybrid topology: optional hub for discovery, direct peer connections
- Progressive enhancement: single-server unchanged, federation opt-in
- WebSocket + TLS for peer-to-peer daemon communication
- Message queuing for resilience during disconnects

Includes:
- Full protocol specification (PEER_HELLO, PEER_ROUTE, etc.)
- Agent discovery and registry design
- Security model (TLS, pre-shared tokens)
- Configuration schema
- CLI interface design
- 5-phase implementation plan (~4-5 weeks)

bd-TBD
---
 docs/FEDERATION_PROPOSAL.md | 1186 +++++++++++++++++++++++++++++++++++
 1 file changed, 1186 insertions(+)
 create mode 100644 docs/FEDERATION_PROPOSAL.md

diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md
new file mode 100644
index 000000000..2aff708b5
--- /dev/null
+++ b/docs/FEDERATION_PROPOSAL.md
@@ -0,0 +1,1186 @@
+# Agent Relay Federation: Cross-Server Communication Proposal
+
+## Executive Summary
+
+This proposal extends agent-relay to support **federated multi-server deployments** while preserving the core differentiator: **automatic message injection via tmux**. Unlike polling-based systems (mcp_agent_mail OSS), federated agent-relay maintains real-time, interrupt-driven communication across server boundaries.
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           FEDERATED AGENT-RELAY                              │
+│                                                                              │
+│  ┌─────────────┐         ┌─────────────┐         ┌─────────────┐           │
+│  │  Server A   │         │  Server B   │         │  Server C   │           │
+│  │             │         │             │         │             │           │
+│  │ ┌───┐ ┌───┐ │  wss:// │ ┌───┐ ┌───┐ │  wss:// │ ┌───┐       │           │
+│  │ │Ali│ │Bob│ │◄───────►│ │Car│ │Dav│ │◄───────►│ │Eve│       │           │
+│  │ └─┬─┘ └─┬─┘ │         │ └─┬─┘ └─┬─┘ │         │ └─┬─┘       │           │
+│  │   │     │   │         │   │     │   │         │   │         │           │
+│  │ ┌─┴─────┴─┐ │         │ ┌─┴─────┴─┐ │         │ ┌─┴───┐     │           │
+│  │ │ Daemon  │ │         │ │ Daemon  │ │         │ │Daemon│     │           │
+│  │ └─────────┘ │         │ └─────────┘ │         │ └─────┘     │           │
+│  └─────────────┘         └─────────────┘         └─────────────┘           │
+│                                                                              │
+│  • Agents run in tmux (unchanged)                                           │
+│  • Local injection via send-keys (unchanged)                                │
+│  • Cross-server routing via WebSocket (NEW)                                 │
+│  • Fleet-wide agent discovery (NEW)                                         │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Table of Contents
+
+1. [Design Principles](#1-design-principles)
+2. [Architecture Overview](#2-architecture-overview)
+3. [Network Topology](#3-network-topology)
+4. [Protocol Specification](#4-protocol-specification)
+5. [Agent Discovery & Registry](#5-agent-discovery--registry)
+6. [Message Routing](#6-message-routing)
+7. [Security Model](#7-security-model)
+8. [Failure Handling & Resilience](#8-failure-handling--resilience)
+9. [Configuration](#9-configuration)
+10. [CLI Interface](#10-cli-interface)
+11. [Implementation Plan](#11-implementation-plan)
+12. [Migration Path](#12-migration-path)
+
+---
+
+## 1. Design Principles
+
+### 1.1 Preserve the Core Magic
+
+The #1 requirement is preserving **automatic message injection**:
+
+```
+Message arrives → tmux send-keys → Agent receives as user input
+```
+
+This is what differentiates agent-relay from polling-based systems. Federation must not compromise this.
+
+### 1.2 Separation of Concerns
+
+```
+┌─────────────────────────────────────────────────────────┐
+│                    ROUTING LAYER (NEW)                   │
+│         Cross-server message delivery via WebSocket      │
+├─────────────────────────────────────────────────────────┤
+│                  INJECTION LAYER (UNCHANGED)             │
+│         Local tmux send-keys for each server             │
+└─────────────────────────────────────────────────────────┘
+```
+
+- **Routing** is a network problem → WebSocket between daemons
+- **Injection** is a local problem → tmux send-keys (unchanged)
+
+### 1.3 Progressive Enhancement
+
+- Single-server deployments work exactly as before
+- Federation is opt-in via configuration
+- No breaking changes to existing setups
+
+### 1.4 Operational Simplicity
+
+- No external dependencies (Redis, NATS) required
+- Optional hub for convenience, not required
+- Static peer configuration works fine
+- Simple CLI for fleet management
+
+---
+
+## 2. Architecture Overview
+
+### 2.1 Component Diagram
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                              SERVER NODE                                     │
+│                                                                              │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                         AGENT LAYER                                  │   │
+│  │                                                                      │   │
+│  │   ┌──────────────┐    ┌──────────────┐    ┌──────────────┐         │   │
+│  │   │ tmux: Alice  │    │ tmux: Bob    │    │ tmux: Carol  │         │   │
+│  │   │ (claude)     │    │ (codex)      │    │ (gemini)     │         │   │
+│  │   └──────┬───────┘    └──────┬───────┘    └──────┬───────┘         │   │
+│  │          │ capture-pane      │                   │                  │   │
+│  │          │ send-keys         │                   │                  │   │
+│  │          ▼                   ▼                   ▼                  │   │
+│  │   ┌──────────────────────────────────────────────────────────┐     │   │
+│  │   │                   TmuxWrapper (per agent)                 │     │   │
+│  │   │  • Parse @relay: patterns                                 │     │   │
+│  │   │  • Inject incoming messages                               │     │   │
+│  │   │  • Connect to local daemon                                │     │   │
+│  │   └──────────────────────────┬───────────────────────────────┘     │   │
+│  │                              │ Unix Socket                         │   │
+│  └──────────────────────────────┼─────────────────────────────────────┘   │
+│                                 │                                          │
+│  ┌──────────────────────────────▼─────────────────────────────────────┐   │
+│  │                         DAEMON LAYER                                │   │
+│  │                                                                      │   │
+│  │   ┌─────────────────┐    ┌─────────────────┐    ┌────────────────┐ │   │
+│  │   │  LocalServer    │    │  PeerManager    │    │  Registry      │ │   │
+│  │   │  (Unix socket)  │    │  (WebSocket)    │    │  (agents map)  │ │   │
+│  │   │                 │    │                 │    │                │ │   │
+│  │   │ • Accept local  │    │ • Connect peers │    │ • Local agents │ │   │
+│  │   │   connections   │    │ • Route cross-  │    │ • Remote agents│ │   │
+│  │   │ • Handle HELLO  │    │   server msgs   │    │ • Server map   │ │   │
+│  │   └────────┬────────┘    └────────┬────────┘    └────────┬───────┘ │   │
+│  │            │                      │                      │         │   │
+│  │            └──────────────────────┼──────────────────────┘         │   │
+│  │                                   │                                 │   │
+│  │                         ┌─────────▼─────────┐                      │   │
+│  │                         │      Router       │                      │   │
+│  │                         │                   │                      │   │
+│  │                         │ • Decide local vs │                      │   │
+│  │                         │   remote routing  │                      │   │
+│  │                         │ • Handle broadcast│                      │   │
+│  │                         │ • Queue on disco- │                      │   │
+│  │                         │   nnect           │                      │   │
+│  │                         └───────────────────┘                      │   │
+│  │                                                                      │   │
+│  └──────────────────────────────────────────────────────────────────────┘   │
+│                                 │                                          │
+│                                 │ WebSocket (wss://)                       │
+│                                 ▼                                          │
+│                    ┌─────────────────────────────┐                        │
+│                    │       PEER SERVERS          │                        │
+│                    │   (other fleet members)     │                        │
+│                    └─────────────────────────────┘                        │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+### 2.2 Data Flow: Cross-Server Message
+
+```
+Alice@ServerA sends to Bob@ServerB:
+
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ SERVER A                                                                     │
+│                                                                              │
+│ 1. Alice outputs: @relay:Bob Can you review auth.ts?                        │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 2. TmuxWrapper captures via tmux capture-pane                               │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 3. Parser extracts: { to: "Bob", body: "Can you review auth.ts?" }          │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 4. RelayClient sends SEND envelope to local daemon (Unix socket)            │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 5. Router checks registry: Bob not local, Bob is on ServerB                 │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 6. PeerManager sends PEER_ROUTE to ServerB (WebSocket)                      │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    │ wss:// (TLS encrypted)
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ SERVER B                                                                     │
+│                                                                              │
+│ 7. PeerManager receives PEER_ROUTE                                          │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 8. Router looks up Bob in local connections                                 │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 9. Router calls TmuxWrapper.deliverToLocal(Bob, envelope)                   │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 10. TmuxWrapper executes:                                                   │
+│     tmux send-keys -t relay-Bob-12345 -l "Relay message from Alice..."      │
+│     tmux send-keys -t relay-Bob-12345 Enter                                 │
+│         │                                                                    │
+│         ▼                                                                    │
+│ 11. Bob's agent receives message AS USER INPUT                              │
+│     (automatic, no polling, no checking!)                                   │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## 3. Network Topology
+
+### 3.1 Topology Options
+
+#### Option A: Full Mesh (Recommended for <10 servers)
+
+```
+        ServerA ◄────────► ServerB
+           ▲                  ▲
+           │                  │
+           │                  │
+           ▼                  ▼
+        ServerC ◄────────► ServerD
+```
+
+- Every daemon connects to every other daemon
+- O(n²) connections, but fine for small fleets
+- No single point of failure
+- Lowest latency (direct paths)
+
+#### Option B: Hub-and-Spoke (Recommended for 10+ servers)
+
+```
+                    ┌─────────┐
+         ┌─────────►│   Hub   │◄─────────┐
+         │          └────┬────┘          │
+         │               │               │
+         ▼               ▼               ▼
+     ServerA         ServerB         ServerC
+```
+
+- All daemons connect to central hub
+- Hub routes messages between servers
+- Single point of failure (mitigate with hub HA)
+- Simpler operations
+
+#### Option C: Hybrid (Recommended for production)
+
+```
+                    ┌─────────┐
+         ┌─────────►│ Hub     │◄─────────┐
+         │          │(discover)│          │
+         │          └─────────┘          │
+         │                               │
+         ▼                               ▼
+     ServerA ◄─────────────────────► ServerB
+         ▲                               ▲
+         │                               │
+         └───────────► ServerC ◄─────────┘
+```
+
+- Hub provides discovery and registry sync
+- Daemons establish direct peer connections
+- Messages route directly (low latency)
+- Hub failure doesn't break existing connections
+
+### 3.2 Recommended Approach
+
+**Hybrid topology with optional hub:**
+
+1. Daemons can be configured with static peer list (no hub needed)
+2. Optionally connect to hub for dynamic discovery
+3. Once peers are known, establish direct connections
+4. Hub going down doesn't break messaging (just discovery)
+
+---
+
+## 4. Protocol Specification
+
+### 4.1 Peer Protocol Messages
+
+Extend the existing envelope format with peer-specific message types:
+
+```typescript
+// New message types for federation
+type PeerMessageType =
+  | 'PEER_HELLO'      // Initial handshake
+  | 'PEER_WELCOME'    // Handshake response
+  | 'PEER_SYNC'       // Registry synchronization
+  | 'PEER_ROUTE'      // Route message to local agent
+  | 'PEER_BROADCAST'  // Broadcast to local agents
+  | 'PEER_PING'       // Heartbeat
+  | 'PEER_PONG'       // Heartbeat response
+  | 'PEER_BYE';       // Graceful disconnect
+
+// Peer envelope (over WebSocket)
+interface PeerEnvelope<T = unknown> {
+  v: 1;                    // Protocol version
+  type: PeerMessageType;
+  id: string;              // Message UUID
+  ts: number;              // Timestamp
+  from_server: string;     // Originating server ID
+  payload: T;
+}
+```
+
+### 4.2 PEER_HELLO / PEER_WELCOME
+
+Initial handshake between daemons:
+
+```typescript
+// Client → Server
+interface PeerHelloPayload {
+  server_id: string;           // e.g., "nyc-prod-01"
+  server_name?: string;        // Human-readable name
+  version: string;             // agent-relay version
+  capabilities: {
+    max_message_size: number;
+    supports_broadcast: boolean;
+    supports_topics: boolean;
+  };
+  agents: AgentInfo[];         // Local agents to register
+  auth_token: string;          // Pre-shared token
+}
+
+// Server → Client
+interface PeerWelcomePayload {
+  server_id: string;
+  session_id: string;          // For reconnection
+  agents: AgentInfo[];         // Server's local agents
+  peers: PeerInfo[];           // Other known peers (for mesh)
+  config: {
+    heartbeat_ms: number;      // Ping interval
+    sync_interval_ms: number;  // Registry sync interval
+  };
+}
+
+interface AgentInfo {
+  name: string;
+  server_id: string;
+  cli?: string;                // claude, codex, gemini
+  connected_at: string;        // ISO timestamp
+  status: 'online' | 'idle' | 'busy';
+}
+
+interface PeerInfo {
+  server_id: string;
+  url: string;                 // WebSocket URL
+  agents: string[];            // Agent names on this peer
+}
+```
+
+### 4.3 PEER_SYNC
+
+Registry updates when agents join/leave:
+
+```typescript
+interface PeerSyncPayload {
+  type: 'agent_joined' | 'agent_left' | 'full_sync';
+  agents?: AgentInfo[];        // For full_sync
+  agent?: AgentInfo;           // For join/leave
+}
+```
+
+### 4.4 PEER_ROUTE
+
+Forward a message to a specific agent:
+
+```typescript
+interface PeerRoutePayload {
+  original_envelope: Envelope<SendPayload>;  // The actual message
+  target_agent: string;                       // Local agent name
+  hops: string[];                             // Servers traversed (loop prevention)
+}
+```
+
+### 4.5 PEER_BROADCAST
+
+Forward a broadcast to all local agents:
+
+```typescript
+interface PeerBroadcastPayload {
+  original_envelope: Envelope<SendPayload>;
+  exclude_agents?: string[];   // Don't deliver to these
+  scope?: 'fleet' | 'server';  // Broadcast scope
+}
+```
+
+### 4.6 Connection State Machine
+
+```
+                     ┌─────────────────┐
+                     │  DISCONNECTED   │◄─────────────────────┐
+                     └────────┬────────┘                      │
+                              │ connect()                     │
+                              ▼                               │
+                     ┌─────────────────┐                      │
+          ┌──────────│   CONNECTING    │──────────┐           │
+          │          └────────┬────────┘          │           │
+          │                   │ socket open       │           │
+          │ error             ▼                   │ error     │
+          │          ┌─────────────────┐          │           │
+          │          │  HANDSHAKING    │──────────┤           │
+          │          └────────┬────────┘          │           │
+          │                   │ WELCOME received  │           │
+          │                   ▼                   │           │
+          │          ┌─────────────────┐          │           │
+          │          │     ACTIVE      │──────────┤           │
+          │          └────────┬────────┘          │           │
+          │                   │ BYE or error      │           │
+          │                   ▼                   │           │
+          │          ┌─────────────────┐          │           │
+          └─────────►│   RECONNECTING  │◄─────────┘           │
+                     └────────┬────────┘                      │
+                              │ max retries                   │
+                              └───────────────────────────────┘
+
+  Reconnection: exponential backoff 1s → 2s → 4s → 8s → 16s → 30s (max)
+  Max attempts: unlimited (peers are persistent)
+```
+
+---
+
+## 5. Agent Discovery & Registry
+
+### 5.1 Registry Structure
+
+```typescript
+interface FleetRegistry {
+  // All known agents across the fleet
+  agents: Map<string, AgentRecord>;
+
+  // Server information
+  servers: Map<string, ServerRecord>;
+
+  // Index for fast lookup
+  agentToServer: Map<string, string>;  // agentName → serverId
+}
+
+interface AgentRecord {
+  name: string;
+  server_id: string;
+  qualified_name: string;      // "Alice@nyc-prod-01"
+  cli?: string;
+  status: 'online' | 'idle' | 'offline';
+  connected_at: string;
+  last_seen: string;
+  metadata?: Record<string, unknown>;
+}
+
+interface ServerRecord {
+  id: string;
+  name?: string;
+  url: string;
+  status: 'connected' | 'disconnected' | 'unknown';
+  agents: Set<string>;
+  connected_at?: string;
+  last_seen: string;
+  latency_ms?: number;
+}
+```
+
+### 5.2 Name Resolution
+
+Agents can be addressed in multiple ways:
+
+| Pattern | Resolution |
+|---------|------------|
+| `@relay:Bob` | Local first, then fleet-wide lookup |
+| `@relay:Bob@nyc` | Explicitly route to server "nyc" |
+| `@relay:Bob@*` | Send to ALL agents named Bob (rare) |
+| `@relay:*` | Broadcast to entire fleet |
+| `@relay:*@local` | Broadcast to local server only |
+| `@relay:*@nyc` | Broadcast to all agents on "nyc" |
+
+### 5.3 Name Collision Handling
+
+If two agents have the same name on different servers:
+
+1. **Local preference**: Unqualified name routes to local agent first
+2. **First-registered wins**: For fleet-wide lookup, first to register owns the name
+3. **Explicit qualification**: Use `@relay:Bob@server-id` to disambiguate
+4. **Warning on collision**: Daemon logs warning when collision detected
+
+```typescript
+// Resolution algorithm
+function resolveAgent(name: string, fromServer: string): AgentRecord | null {
+  // Check for explicit qualification
+  if (name.includes('@')) {
+    const [agentName, serverSpec] = name.split('@');
+    return registry.findOnServer(agentName, serverSpec);
+  }
+
+  // Try local first
+  const local = registry.findLocal(name, fromServer);
+  if (local) return local;
+
+  // Fleet-wide lookup (first registered)
+  return registry.findAny(name);
+}
+```
+
+### 5.4 Registry Synchronization
+
+Registries sync via gossip-like protocol:
+
+```
+Server A joins fleet:
+1. A → B: PEER_HELLO (includes A's agents)
+2. B → A: PEER_WELCOME (includes B's agents + known peers)
+3. A → C: PEER_HELLO (A learned about C from B)
+4. A → B: PEER_SYNC (A now knows about C's agents)
+...eventually consistent...
+```
+
+**Sync triggers:**
+- New peer connection
+- Agent joins/leaves locally
+- Periodic full sync (every 60s)
+- On reconnection after disconnect
+
+---
+
+## 6. Message Routing
+
+### 6.1 Routing Algorithm
+
+```typescript
+class FederatedRouter {
+  route(from: string, envelope: Envelope<SendPayload>): void {
+    const target = envelope.to;
+
+    // 1. Broadcast handling
+    if (target === '*' || target?.startsWith('*@')) {
+      return this.handleBroadcast(from, envelope, target);
+    }
+
+    // 2. Resolve target agent
+    const resolved = this.registry.resolve(target, this.serverId);
+    if (!resolved) {
+      this.sendNack(from, envelope.id, 'UNKNOWN_AGENT');
+      return;
+    }
+
+    // 3. Local delivery
+    if (resolved.server_id === this.serverId) {
+      this.deliverLocal(from, resolved.name, envelope);
+      return;
+    }
+
+    // 4. Remote delivery
+    this.deliverRemote(from, resolved, envelope);
+  }
+
+  private handleBroadcast(from: string, envelope: Envelope, scope: string): void {
+    const [, serverSpec] = scope.split('@');
+
+    // Deliver to local agents (except sender)
+    if (!serverSpec || serverSpec === 'local' || serverSpec === this.serverId) {
+      for (const agent of this.localAgents) {
+        if (agent.name !== from) {
+          this.deliverLocal(from, agent.name, envelope);
+        }
+      }
+    }
+
+    // Forward to peers (unless local-only)
+    if (serverSpec !== 'local') {
+      for (const peer of this.peers.values()) {
+        if (!serverSpec || peer.serverId === serverSpec) {
+          peer.send({
+            type: 'PEER_BROADCAST',
+            payload: {
+              original_envelope: envelope,
+              exclude_agents: [from],
+            }
+          });
+        }
+      }
+    }
+  }
+
+  private deliverRemote(
+    from: string,
+    target: AgentRecord,
+    envelope: Envelope
+  ): void {
+    const peer = this.peers.get(target.server_id);
+
+    if (!peer || peer.state !== 'ACTIVE') {
+      // Queue for later delivery
+      this.queueMessage(target.server_id, envelope);
+      return;
+    }
+
+    peer.send({
+      type: 'PEER_ROUTE',
+      payload: {
+        original_envelope: envelope,
+        target_agent: target.name,
+        hops: [this.serverId],
+      }
+    });
+  }
+}
+```
+
+### 6.2 Message Queuing
+
+When a peer is disconnected, messages are queued:
+
+```typescript
+interface QueuedMessage {
+  envelope: Envelope;
+  target_server: string;
+  queued_at: number;
+  attempts: number;
+  expires_at: number;  // TTL
+}
+
+class MessageQueue {
+  private queues: Map<string, QueuedMessage[]>;
+
+  enqueue(serverId: string, envelope: Envelope): void {
+    const queue = this.queues.get(serverId) ?? [];
+    queue.push({
+      envelope,
+      target_server: serverId,
+      queued_at: Date.now(),
+      attempts: 0,
+      expires_at: Date.now() + 3600000,  // 1 hour TTL
+    });
+    this.queues.set(serverId, queue);
+  }
+
+  // Called when peer reconnects
+  flush(serverId: string, peer: PeerConnection): void {
+    const queue = this.queues.get(serverId) ?? [];
+    for (const msg of queue) {
+      if (Date.now() < msg.expires_at) {
+        peer.send({ type: 'PEER_ROUTE', payload: msg.envelope });
+      }
+    }
+    this.queues.delete(serverId);
+  }
+}
+```
+
+### 6.3 Loop Prevention
+
+Prevent messages from bouncing between servers:
+
+```typescript
+interface PeerRoutePayload {
+  // ... other fields
+  hops: string[];  // Servers this message has traversed
+}
+
+// On receiving PEER_ROUTE
+function handlePeerRoute(msg: PeerEnvelope<PeerRoutePayload>): void {
+  // Check for loop
+  if (msg.payload.hops.includes(this.serverId)) {
+    console.warn('Loop detected, dropping message');
+    return;
+  }
+
+  // Add ourselves to hops if forwarding
+  if (needsForwarding) {
+    msg.payload.hops.push(this.serverId);
+  }
+}
+```
+
+---
+
+## 7. Security Model
+
+### 7.1 Authentication
+
+**Pre-shared tokens** for simplicity:
+
+```yaml
+# Server A config
+auth:
+  server_token: "server-a-secret-token"
+  peer_tokens:
+    server-b: "shared-secret-ab"
+    server-c: "shared-secret-ac"
+```
+
+Tokens are exchanged in PEER_HELLO and validated before PEER_WELCOME.
+
+### 7.2 Transport Security
+
+**Mandatory TLS** for peer connections:
+
+```typescript
+const ws = new WebSocket(peerUrl, {
+  // TLS configuration
+  rejectUnauthorized: true,  // Verify peer certificate
+  ca: fs.readFileSync('/etc/agent-relay/ca.pem'),
+
+  // Client certificate (for mTLS)
+  cert: fs.readFileSync('/etc/agent-relay/client.pem'),
+  key: fs.readFileSync('/etc/agent-relay/client-key.pem'),
+});
+```
+
+### 7.3 Authorization
+
+Simple capability model:
+
+```typescript
+interface ServerCapabilities {
+  can_broadcast: boolean;       // Can send fleet-wide broadcasts
+  can_route_to: string[];       // Allowed target servers
+  max_message_rate: number;     // Rate limit (msgs/sec)
+  allowed_agents: string[];     // Can message these agents (or '*')
+}
+```
+
+### 7.4 Security Checklist
+
+| Control | Status | Notes |
+|---------|--------|-------|
+| TLS encryption | Required | wss:// only |
+| Peer authentication | Required | Pre-shared token |
+| mTLS (mutual TLS) | Optional | For high-security |
+| Message signing | Future | Verify message origin |
+| Rate limiting | Recommended | Prevent floods |
+| Audit logging | Recommended | Log all cross-server |
+
+---
+
+## 8. Failure Handling & Resilience
+
+### 8.1 Connection Failures
+
+```
+┌─────────────────────────────────────────────────────────────────┐
+│                    CONNECTION FAILURE HANDLING                   │
+│                                                                  │
+│  Peer disconnects                                                │
+│       │                                                          │
+│       ▼                                                          │
+│  Mark peer as DISCONNECTED                                       │
+│       │                                                          │
+│       ├──► Queue outbound messages                               │
+│       │                                                          │
+│       ├──► Start reconnection timer                              │
+│       │    (exponential backoff: 1s, 2s, 4s, ... 30s max)       │
+│       │                                                          │
+│       └──► Notify local agents (optional)                        │
+│            "@relay:* [SYSTEM] Lost connection to server-b"       │
+│                                                                  │
+│  On reconnect:                                                   │
+│       │                                                          │
+│       ├──► Re-authenticate (PEER_HELLO/WELCOME)                 │
+│       │                                                          │
+│       ├──► Sync registries (PEER_SYNC full_sync)                │
+│       │                                                          │
+│       └──► Flush queued messages                                 │
+│                                                                  │
+└─────────────────────────────────────────────────────────────────┘
+```
+
+### 8.2 Split Brain Prevention
+
+If the fleet gets partitioned:
+
+1. **Agents remain addressable** within their partition
+2. **Cross-partition messages queue** until healed
+3. **No automatic conflict resolution** - messages deliver in order received
+4. **TTL expiration** - queued messages expire after 1 hour (configurable)
+
+### 8.3 Graceful Degradation
+
+```
+Fleet healthy:     A ◄──► B ◄──► C    (full connectivity)
+
+B goes down:       A ◄─X─► B ◄─X─► C
+                   A ◄──────────────► C  (A-C still works)
+
+B comes back:      A ◄──► B ◄──► C    (queued messages flush)
+```
+
+### 8.4 Health Monitoring
+
+```typescript
+// Heartbeat every 30 seconds
+setInterval(() => {
+  for (const peer of this.peers.values()) {
+    if (peer.state === 'ACTIVE') {
+      peer.send({ type: 'PEER_PING', ts: Date.now() });
+
+      // If no PONG in 60s, consider dead
+      peer.setTimeout(() => {
+        if (!peer.lastPong || Date.now() - peer.lastPong > 60000) {
+          peer.reconnect();
+        }
+      }, 60000);
+    }
+  }
+}, 30000);
+```
+
+---
+
+## 9. Configuration
+
+### 9.1 Configuration File
+
+```yaml
+# /etc/agent-relay/config.yaml (or ~/.agent-relay/config.yaml)
+
+# Server identity
+server:
+  id: nyc-prod-01                    # Unique server ID
+  name: "NYC Production 01"          # Human-readable name
+
+# Local daemon settings (unchanged from current)
+local:
+  socket_path: /tmp/agent-relay/relay.sock
+  storage_path: /tmp/agent-relay/messages.sqlite
+
+# Federation settings (NEW)
+federation:
+  enabled: true
+
+  # Listen for peer connections
+  listen:
+    host: 0.0.0.0
+    port: 8765
+
+  # TLS configuration
+  tls:
+    enabled: true
+    cert: /etc/agent-relay/server.pem
+    key: /etc/agent-relay/server-key.pem
+    ca: /etc/agent-relay/ca.pem       # For client verification
+    mutual: false                      # Require client certs
+
+  # Authentication
+  auth:
+    # This server's token (peers use this to connect to us)
+    server_token: "${RELAY_SERVER_TOKEN}"
+
+    # Tokens for connecting to peers
+    peer_tokens:
+      london-prod-01: "${RELAY_TOKEN_LONDON}"
+      tokyo-prod-01: "${RELAY_TOKEN_TOKYO}"
+
+  # Peer connections
+  peers:
+    - url: wss://london.example.com:8765
+      server_id: london-prod-01
+      auto_connect: true
+
+    - url: wss://tokyo.example.com:8765
+      server_id: tokyo-prod-01
+      auto_connect: true
+
+  # Optional hub for discovery
+  hub:
+    url: wss://hub.example.com:8765
+    token: "${RELAY_HUB_TOKEN}"
+    enabled: false                    # Hub is optional
+
+  # Behavior settings
+  settings:
+    heartbeat_interval_ms: 30000      # Ping peers every 30s
+    reconnect_max_delay_ms: 30000     # Max backoff delay
+    message_queue_ttl_ms: 3600000     # 1 hour queue TTL
+    sync_interval_ms: 60000           # Full registry sync
+    max_message_size_bytes: 1048576   # 1 MiB
+
+# Dashboard settings
+dashboard:
+  enabled: true
+  port: 3888
+  show_fleet: true                    # Show all fleet agents
+```
+
+### 9.2 Environment Variables
+
+All config can be overridden via environment:
+
+```bash
+# Server identity
+AGENT_RELAY_SERVER_ID=nyc-prod-01
+AGENT_RELAY_SERVER_NAME="NYC Production 01"
+
+# Federation
+AGENT_RELAY_FEDERATION_ENABLED=true
+AGENT_RELAY_FEDERATION_PORT=8765
+AGENT_RELAY_SERVER_TOKEN=secret-token
+AGENT_RELAY_PEER_london-prod-01_TOKEN=london-token
+AGENT_RELAY_PEER_london-prod-01_URL=wss://london.example.com:8765
+```
+
+### 9.3 Minimal Configuration
+
+For simple two-server setup:
+
+```yaml
+# Server A (nyc)
+server:
+  id: nyc
+federation:
+  enabled: true
+  listen:
+    port: 8765
+  auth:
+    server_token: "shared-secret"
+  peers:
+    - url: wss://london.example.com:8765
+      server_id: london
+```
+
+```yaml
+# Server B (london)
+server:
+  id: london
+federation:
+  enabled: true
+  listen:
+    port: 8765
+  auth:
+    server_token: "shared-secret"
+  peers:
+    - url: wss://nyc.example.com:8765
+      server_id: nyc
+```
+
+---
+
+## 10. CLI Interface
+
+### 10.1 New Commands
+
+```bash
+# Start daemon with federation
+agent-relay up [--peer-port 8765] [--config /path/to/config.yaml]
+
+# Peer management
+agent-relay peer list                           # List connected peers
+agent-relay peer add <url> [--token <token>]    # Add peer dynamically
+agent-relay peer remove <server-id>             # Remove peer
+agent-relay peer status <server-id>             # Detailed peer status
+
+# Fleet-wide agent listing
+agent-relay agents                     # Local agents only (default)
+agent-relay agents --fleet             # All agents in fleet
+agent-relay agents --server <id>       # Agents on specific server
+
+# Send to remote agent
+agent-relay send <agent>[@<server>] <message>
+
+# Fleet status
+agent-relay fleet status               # Overview of all servers
+agent-relay fleet topology             # Show connection graph
+
+# Debugging
+agent-relay fleet ping <server-id>     # Ping a peer
+agent-relay fleet trace <agent>        # Trace route to agent
+```
+
+### 10.2 Example Session
+
+```bash
+# On Server NYC
+$ agent-relay up --peer-port 8765
+Daemon started (federation enabled)
+Listening for peers on :8765
+Connecting to peer: london.example.com:8765...
+Connected to london (3 agents)
+Connecting to peer: tokyo.example.com:8765...
+Connected to tokyo (2 agents)
+
+$ agent-relay agents --fleet
+AGENT         SERVER    CLI      STATUS    CONNECTED
+Alice         nyc       claude   online    2 min ago
+Bob           nyc       codex    online    5 min ago
+Carol         london    claude   online    1 min ago
+Dave          london    gemini   idle      10 min ago
+Eve           tokyo     claude   online    3 min ago
+
+$ agent-relay fleet status
+SERVER    STATUS       AGENTS    LATENCY    LAST SEEN
+nyc       local        2         -          -
+london    connected    2         45ms       just now
+tokyo     connected    1         120ms      2s ago
+
+# Start an agent that can message across servers
+$ agent-relay -n Alice claude
+
+# Inside Claude session:
+# Alice> @relay:Carol Can you help with the auth module?
+# [relay:Alice] → Carol@london: Can you help with the auth module?
+
+# Carol (on london server) automatically receives:
+# "Relay message from Alice@nyc [abc123]: Can you help with the auth module?"
+```
+
+### 10.3 Addressing Examples
+
+```bash
+# From Alice on NYC server:
+
+@relay:Bob hello                    # Bob is local (NYC) → local delivery
+@relay:Carol hello                  # Carol is on london → routes to london
+@relay:Carol@london hello           # Explicit: route to Carol on london
+@relay:Eve@tokyo hello              # Explicit: route to Eve on tokyo
+@relay:* Status update              # Broadcast to ALL agents in fleet
+@relay:*@local Status update        # Broadcast only to NYC agents
+@relay:*@london Status update       # Broadcast to all london agents
+```
+
+---
+
+## 11. Implementation Plan
+
+### 11.1 Phases
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ PHASE 1: Foundation (1 week)                                                 │
+│                                                                              │
+│ • Define peer protocol types (src/protocol/peer-types.ts)                   │
+│ • Implement PeerConnection class (src/federation/peer-connection.ts)        │
+│ • Implement basic HELLO/WELCOME handshake                                   │
+│ • Add peer WebSocket listener to daemon                                     │
+│ • Unit tests for protocol                                                   │
+│                                                                              │
+│ Deliverable: Two daemons can connect and handshake                          │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ PHASE 2: Registry & Discovery (1 week)                                       │
+│                                                                              │
+│ • Implement FleetRegistry (src/federation/registry.ts)                      │
+│ • Implement PEER_SYNC message handling                                      │
+│ • Add registry to Router for lookups                                        │
+│ • Implement name resolution (local → fleet)                                 │
+│ • CLI: `agent-relay agents --fleet`                                         │
+│                                                                              │
+│ Deliverable: Agents visible across servers                                  │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ PHASE 3: Message Routing (1 week)                                            │
+│                                                                              │
+│ • Implement PEER_ROUTE handling                                             │
+│ • Implement PEER_BROADCAST handling                                         │
+│ • Integrate with existing Router                                            │
+│ • Cross-server message delivery                                             │
+│ • Local tmux injection on receipt (existing code!)                          │
+│                                                                              │
+│ Deliverable: Alice@NYC can message Bob@London automatically                 │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ PHASE 4: Resilience (1 week)                                                 │
+│                                                                              │
+│ • Implement reconnection logic                                              │
+│ • Implement message queue for disconnected peers                            │
+│ • Implement heartbeat (PING/PONG)                                           │
+│ • Handle graceful shutdown (PEER_BYE)                                       │
+│ • CLI: `agent-relay peer status`                                            │
+│                                                                              │
+│ Deliverable: Fleet survives server restarts                                 │
+└─────────────────────────────────────────────────────────────────────────────┘
+                                    │
+                                    ▼
+┌─────────────────────────────────────────────────────────────────────────────┐
+│ PHASE 5: Security & Polish (1 week)                                          │
+│                                                                              │
+│ • Add TLS support for peer connections                                      │
+│ • Add token-based authentication                                            │
+│ • Add configuration file support                                            │
+│ • Update dashboard for fleet view                                           │
+│ • Documentation                                                             │
+│                                                                              │
+│ Deliverable: Production-ready federation                                    │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+### 11.2 File Structure
+
+```
+src/
+├── federation/                    # NEW: Federation module
+│   ├── index.ts                   # Exports
+│   ├── peer-connection.ts         # WebSocket connection to peer
+│   ├── peer-manager.ts            # Manages all peer connections
+│   ├── peer-server.ts             # WebSocket server for incoming
+│   ├── registry.ts                # Fleet-wide agent registry
+│   ├── message-queue.ts           # Queue for disconnected peers
+│   └── config.ts                  # Federation configuration
+│
+├── protocol/
+│   ├── types.ts                   # Existing types
+│   └── peer-types.ts              # NEW: Peer protocol types
+│
+├── daemon/
+│   ├── server.ts                  # Modified: integrate federation
+│   ├── router.ts                  # Modified: federated routing
+│   └── ...
+│
+└── cli/
+    └── index.ts                   # Modified: new commands
+```
+
+### 11.3 Estimated Effort
+
+| Phase | Effort | Dependencies |
+|-------|--------|--------------|
+| Phase 1: Foundation | 3-4 days | None |
+| Phase 2: Registry | 3-4 days | Phase 1 |
+| Phase 3: Routing | 3-4 days | Phase 2 |
+| Phase 4: Resilience | 3-4 days | Phase 3 |
+| Phase 5: Security | 3-4 days | Phase 4 |
+| **Total** | **~4-5 weeks** | |
+
+---
+
+## 12. Migration Path
+
+### 12.1 Backward Compatibility
+
+Existing single-server deployments work without changes:
+
+```yaml
+# No federation block = single-server mode (current behavior)
+local:
+  socket_path: /tmp/agent-relay/relay.sock
+```
+
+### 12.2 Upgrade Path
+
+1. **Update agent-relay** to federation-capable version
+2. **Add federation config** to enable cross-server
+3. **Start daemons** - they auto-connect to peers
+4. **Agents just work** - no changes needed
+
+### 12.3 Rollback
+
+If issues arise:
+1. Set `federation.enabled: false`
+2. Restart daemon
+3. Back to single-server mode
+
+---
+
+## Summary
+
+This proposal extends agent-relay to support federated multi-server deployments while **preserving the core differentiator**: automatic message injection via tmux.
+
+**Key points:**
+
+1. **Injection stays local** - Each server runs tmux sessions, does local send-keys
+2. **Routing goes network** - Daemons connect via WebSocket for cross-server
+3. **Progressive enhancement** - Single-server still works, federation is opt-in
+4. **Simple operations** - Static peer config works, hub optional
+5. **Resilient** - Reconnection, message queuing, graceful degradation
+
+**What we preserve:**
+- Zero-config agent integration (@relay: pattern)
+- Automatic message delivery (no polling)
+- Low latency (<5ms local, +network RTT remote)
+- Simple mental model
+
+**What we add:**
+- Cross-server messaging
+- Fleet-wide agent discovery
+- Peer-to-peer daemon connections
+- Message queuing for resilience
+
+---
+
+## Next Steps
+
+1. Review and approve this proposal
+2. Create implementation tasks in Beads
+3. Begin Phase 1: Foundation

From bf1268bb8131a7253a36bc941576fce1398a78ea Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 07:52:08 +0000
Subject: [PATCH 02/19] Add critical review of federation proposal
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Identifies major gaps and risks in the federation design:

HIGH SEVERITY:
- No end-to-end delivery guarantee (sender doesn't know if agent received)
- Registry consistency race conditions (split-brain on name collisions)
- Message ordering not guaranteed across servers

MEDIUM SEVERITY:
- Token management doesn't scale (N² tokens for N servers)
- No message-level authentication (spoofing possible)
- No rate limiting (flood attacks possible)
- Debugging distributed failures is hard (no tracing)
- NAT/firewall traversal not addressed
- Timeline underestimated (8-10 weeks realistic vs 4-5 proposed)

Includes:
- Specific failure scenarios for each issue
- Recommendations for fixes
- Alternative approaches (NATS, SSH tunnels)
- Suggested MVP scope to ship faster

bd-TBD
---
 docs/FEDERATION_PROPOSAL_REVIEW.md | 584 +++++++++++++++++++++++++++++
 1 file changed, 584 insertions(+)
 create mode 100644 docs/FEDERATION_PROPOSAL_REVIEW.md

diff --git a/docs/FEDERATION_PROPOSAL_REVIEW.md b/docs/FEDERATION_PROPOSAL_REVIEW.md
new file mode 100644
index 000000000..833b65e6a
--- /dev/null
+++ b/docs/FEDERATION_PROPOSAL_REVIEW.md
@@ -0,0 +1,584 @@
+# Federation Proposal: Critical Review
+
+A critical analysis of FEDERATION_PROPOSAL.md identifying gaps, risks, and areas needing more thought.
+
+---
+
+## Executive Summary: Major Concerns
+
+| Category | Severity | Issue |
+|----------|----------|-------|
+| **Delivery Guarantees** | 🔴 High | No end-to-end acknowledgment |
+| **Registry Consistency** | 🔴 High | Race conditions in name resolution |
+| **Security** | 🟡 Medium | Token management doesn't scale |
+| **Operational** | 🟡 Medium | Debugging distributed failures is hard |
+| **Timeline** | 🟡 Medium | 4-5 weeks is optimistic |
+| **NAT/Firewall** | 🟡 Medium | Assumes direct connectivity |
+
+---
+
+## 1. Fundamental Architecture Issues
+
+### 1.1 🔴 No End-to-End Delivery Guarantee
+
+**The Problem:**
+
+```
+Alice@A → Daemon A → Daemon B → ??? → Bob receives?
+         ↑           ↑
+         ACK         ACK
+         (local)     (peer)
+
+But does Bob's agent actually SEE the message?
+```
+
+The proposal has ACKs between daemons, but:
+- No confirmation that `tmux send-keys` succeeded
+- No confirmation that the agent processed the message
+- Sender Alice has no idea if Bob actually received it
+
+**Real failure modes:**
+- Bob's tmux session crashed between delivery and injection
+- Bob's agent is in a blocking state (waiting for human input)
+- Injection happened but Bob's agent ignored it (prompt too long, agent confused)
+
+**Recommendation:**
+Add optional end-to-end ACK pattern:
+```
+Alice sends → Bob receives → Bob's daemon detects "Relay message" in output
+                           → Bob's daemon sends DELIVERY_CONFIRMED back to Alice
+```
+
+### 1.2 🔴 Registry Consistency Race Conditions
+
+**The Problem:**
+
+The proposal says "first-registered wins" for name collisions, but with async gossip:
+
+```
+Time 0:  Server A has no "Bob"
+         Server B has no "Bob"
+
+Time 1:  Alice on A starts "Bob" agent
+         Carol on B starts "Bob" agent
+
+Time 2:  A sends PEER_SYNC: "Bob joined"
+         B sends PEER_SYNC: "Bob joined"
+         (messages cross in flight)
+
+Time 3:  Both A and B think THEIR Bob is the "real" Bob
+         Fleet has split-brain on who "Bob" is
+```
+
+**Result:** `@relay:Bob` from Server C routes to different agents depending on which PEER_SYNC arrived first. Completely non-deterministic.
+
+**Recommendation:**
+- Use Lamport timestamps or vector clocks for ordering
+- Or: require unique names fleet-wide (reject registration if name exists anywhere)
+- Or: always require qualified names (`Bob@server-a`)
+
+### 1.3 🟡 Message Ordering Not Guaranteed
+
+**The Problem:**
+
+```
+Alice sends M1 to Bob
+Alice sends M2 to Bob
+Network hiccup: M1 queued, M2 takes different path
+Bob receives: M2, then M1
+
+Bob: "Why is Alice saying 'yes' before asking the question?"
+```
+
+The proposal mentions sequence numbers per-topic but doesn't specify:
+- Are they enforced on delivery?
+- What happens to out-of-order messages?
+- How do sequence numbers work across server restarts?
+
+**Recommendation:**
+- Add explicit ordering guarantees (or document that there are none)
+- Consider per-conversation sequence numbers
+- Or: accept eventual consistency and document it clearly
+
+---
+
+## 2. Security Issues
+
+### 2.1 🟡 Token Management Doesn't Scale
+
+**The Problem:**
+
+With N servers, you need N² tokens (each pair needs a shared secret):
+
+```
+3 servers: 3 tokens (A-B, A-C, B-C)
+5 servers: 10 tokens
+10 servers: 45 tokens
+20 servers: 190 tokens
+```
+
+Managing 190 tokens across 20 servers is operational nightmare.
+
+**Additional concerns:**
+- No token rotation mechanism specified
+- Tokens in config files can leak
+- No revocation process
+
+**Recommendation:**
+- Use asymmetric keys (each server has keypair, sign challenges)
+- Or: single CA, mTLS with auto-rotation
+- Or: hub-based auth (servers auth to hub, hub vouches for peers)
+
+### 2.2 🟡 No Message-Level Authentication
+
+**The Problem:**
+
+Once a peer connection is established, any message from that peer is trusted:
+
+```
+Malicious/compromised Server B:
+  - Connects legitimately to A
+  - Sends: PEER_ROUTE { from: "Alice@A", to: "Bob@A", ... }
+  - Bob thinks Alice sent it, but B fabricated it
+```
+
+The `from_server` field isn't cryptographically verified per-message.
+
+**Recommendation:**
+- Sign each message with server's private key
+- Include HMAC of message content
+- Verify signature before processing
+
+### 2.3 🟡 No Rate Limiting Specified
+
+**The Problem:**
+
+A misbehaving peer can flood the fleet:
+
+```
+Server B: for i in 1..1000000: send(PEER_BROADCAST, "spam")
+
+Result: All agents on all servers get 1M messages injected
+        Fleet is DoS'd
+```
+
+**Recommendation:**
+- Per-peer rate limits
+- Per-agent rate limits
+- Backpressure signaling (BUSY message already exists locally, extend to peers)
+
+---
+
+## 3. Operational Issues
+
+### 3.1 🟡 Debugging Distributed Failures is Hard
+
+**The Problem:**
+
+"My message from Alice@NYC to Bob@London never arrived. Why?"
+
+The proposal has no:
+- Distributed tracing (correlation IDs across servers)
+- Message tracking ("where is message X right now?")
+- Visibility into queue depths
+- Alerting on delivery failures
+
+**Scenario:**
+```
+1. Alice sends message
+2. NYC daemon routes to London
+3. London daemon queues (Bob's agent busy)
+4. London daemon crashes, loses queue
+5. Bob never receives
+6. Nobody knows what happened
+```
+
+**Recommendation:**
+- Add correlation ID to all messages
+- Log all routing decisions with correlation ID
+- Add `agent-relay trace <message-id>` command
+- Persist queue to disk (not just memory)
+
+### 3.2 🟡 No Graceful Fleet Operations
+
+**The Problem:**
+
+How do you:
+- Drain a server (migrate agents before shutdown)?
+- Rolling upgrade the fleet?
+- Add capacity without disruption?
+
+The proposal doesn't address:
+- Agent migration between servers
+- Planned maintenance mode
+- Capacity planning
+
+**Recommendation:**
+- Add DRAINING state (accept no new agents, continue routing)
+- Add agent handoff protocol
+- Document operational runbooks
+
+### 3.3 🟡 Configuration Drift
+
+**The Problem:**
+
+With config files per server:
+- Server A thinks B's token is X
+- Server B rotated to token Y
+- Connection fails, hard to debug
+
+No central config management, no config validation.
+
+**Recommendation:**
+- Add `agent-relay config validate` command
+- Add config sync mechanism (or document best practices)
+- Health check should verify peer auth before claiming "healthy"
+
+---
+
+## 4. Protocol Issues
+
+### 4.1 🟡 No Backpressure Across Servers
+
+**The Problem:**
+
+Local daemon has BUSY message for backpressure. But across servers:
+
+```
+Server A sends 1000 msgs/sec to Server B
+Server B can only inject 10 msgs/sec (agents are slow)
+Server B's queue grows unbounded → OOM
+```
+
+**Recommendation:**
+- Add PEER_BUSY message
+- Flow control with credits/windows
+- Bounded queues with drop policy (oldest? newest? random?)
+
+### 4.2 🟡 Broadcast Scalability
+
+**The Problem:**
+
+`@relay:*` with 50 agents across 10 servers:
+
+```
+Origin server:
+  - 5 local agents → 5 local deliveries
+  - 9 peers → 9 PEER_BROADCAST messages
+
+Each peer:
+  - 5 local agents → 5 local deliveries
+
+Total: 5 + (9 × 5) = 50 deliveries (correct)
+But: 9 WebSocket messages sent simultaneously
+```
+
+For larger fleets:
+- 100 agents, 20 servers → 19 peer broadcasts
+- Each broadcast must be processed fully
+
+No fan-out optimization, no multicast.
+
+**Recommendation:**
+- For hub topology: single broadcast to hub, hub fans out
+- For mesh: consider gossip-style propagation
+- Rate limit broadcasts
+
+### 4.3 🟡 Large Message Handling
+
+**The Problem:**
+
+Max message size is 1 MiB. But:
+- What if an agent tries to send 5 MiB?
+- Silent truncation? Error? Split?
+
+Not specified.
+
+**Recommendation:**
+- Return NACK with PAYLOAD_TOO_LARGE
+- Or: implement message chunking
+- Document limits clearly to agent implementers
+
+---
+
+## 5. Edge Cases Not Addressed
+
+### 5.1 🟡 NAT and Firewall Traversal
+
+**The Problem:**
+
+The proposal assumes direct connectivity:
+
+```
+Server A (public IP) ──► Server B (behind NAT)
+                              ↑
+                              Cannot initiate inbound
+```
+
+Many production servers are behind NATs, firewalls, or in private VPCs.
+
+**Recommendation:**
+- Document network requirements explicitly
+- Consider connection reversal (B connects to A)
+- Consider TURN-style relay for NAT traversal
+- Or: explicitly require hub topology for NAT scenarios
+
+### 5.2 🟡 Clock Skew
+
+**The Problem:**
+
+TTL expiration uses timestamps:
+```typescript
+expires_at: Date.now() + 3600000  // 1 hour
+```
+
+But if Server A's clock is 30 minutes ahead of B's:
+- A queues message with expires_at = A.now + 1hr
+- A reconnects to B after 45 min (A's time)
+- B receives message, checks expires_at against B.now
+- B.now is only 15 min past message creation (from B's perspective)
+- Message still valid... but conceptually stale
+
+Or worse: clocks very far off could cause immediate expiration.
+
+**Recommendation:**
+- Use relative TTL in message (ttl_ms: 3600000)
+- Receiving server applies TTL from receipt time
+- Or: require NTP sync, document assumption
+
+### 5.3 🟡 Server ID Collisions
+
+**The Problem:**
+
+Two servers configured with same ID:
+
+```yaml
+# Server in NYC
+server:
+  id: production
+
+# Server in London (copy-paste error)
+server:
+  id: production
+```
+
+Both connect to hub. Registry confused. Routing broken.
+
+**Recommendation:**
+- Validate ID uniqueness on connection
+- Reject PEER_HELLO if server_id already registered
+- Generate default ID from hostname/MAC if not configured
+
+### 5.4 🟡 Message Replay
+
+**The Problem:**
+
+No replay protection:
+```
+1. Attacker captures PEER_ROUTE message
+2. Attacker replays it 1000 times
+3. Bob gets same message 1000 times
+```
+
+**Recommendation:**
+- Add nonce/message ID to dedup
+- Track seen message IDs (with expiry)
+- Already have `id` field, just need dedup
+
+---
+
+## 6. Missing Features
+
+### 6.1 🟡 No Message Priorities
+
+**The Problem:**
+
+All messages treated equally. But:
+- System messages (peer down notifications) should be urgent
+- Bulk status updates can wait
+- User-initiated messages more important than background sync
+
+**Recommendation:**
+- Add priority field (LOW, NORMAL, HIGH, SYSTEM)
+- Separate queues per priority
+- Process high priority first
+
+### 6.2 🟡 No Metrics or Observability
+
+**The Problem:**
+
+How do you know if federation is healthy?
+
+No specified:
+- Message latency histograms
+- Delivery success rates
+- Queue depths
+- Peer connection status
+- Error rates by type
+
+**Recommendation:**
+- Add Prometheus metrics endpoint
+- Key metrics: peer_connection_state, messages_routed_total, messages_queued, routing_latency_seconds
+- Integrate with dashboard
+
+### 6.3 🟡 No Testing Strategy
+
+**The Problem:**
+
+How do you test federation?
+
+- Unit tests for protocol parsing ✓ (mentioned)
+- Integration tests across "servers"? Not mentioned
+- Chaos testing (network partitions, slow peers)? Not mentioned
+- Performance benchmarks? Not mentioned
+
+**Recommendation:**
+- Add multi-daemon integration test harness
+- Simulate network conditions (latency, packet loss)
+- Chaos tests: kill peers, corrupt messages, reorder
+- Benchmark: messages/sec at various fleet sizes
+
+---
+
+## 7. Timeline Concerns
+
+### 7.1 🟡 4-5 Weeks is Optimistic
+
+**The Reality:**
+
+Distributed systems are hard. The proposal underestimates:
+
+| Phase | Proposed | Realistic |
+|-------|----------|-----------|
+| Foundation | 1 week | 1.5-2 weeks (WebSocket edge cases) |
+| Registry | 1 week | 2 weeks (consistency is hard) |
+| Routing | 1 week | 1.5 weeks (broadcast complexity) |
+| Resilience | 1 week | 2-3 weeks (reconnection, queuing, testing) |
+| Security | 1 week | 2 weeks (TLS setup, token management) |
+| **Total** | 4-5 weeks | **8-10 weeks** |
+
+Plus:
+- Integration testing
+- Documentation
+- Bug fixes from early testing
+- Edge cases discovered in use
+
+**Recommendation:**
+- Double the estimate
+- Plan for Phase 6: Stabilization (2 weeks of bug fixes)
+- MVP first: mesh without hub, no TLS, basic routing
+
+---
+
+## 8. Alternative Approaches Worth Considering
+
+### 8.1 Why Not Use NATS/Redis?
+
+**The proposal dismisses external dependencies but doesn't fully justify.**
+
+NATS JetStream provides:
+- ✅ Persistent queues (survive restart)
+- ✅ Exactly-once delivery
+- ✅ Clustering/HA built-in
+- ✅ Backpressure
+- ✅ Observability
+- ✅ Battle-tested
+
+Custom implementation provides:
+- ✅ No external dependency
+- ✅ Full control
+- ❌ Must implement all of the above
+
+**Honest trade-off:**
+- Custom: 8-10 weeks dev, ongoing maintenance, custom bugs
+- NATS: 1 week integration, proven reliability, learning curve
+
+**Recommendation:**
+At minimum, document why custom is preferred. Consider NATS for production, custom for dev/simple cases.
+
+### 8.2 Simpler Alternative: SSH Tunnels
+
+For small fleets, SSH tunnels might be simpler:
+
+```bash
+# On NYC server, create tunnel to London
+ssh -L 8765:localhost:8765 london.example.com
+
+# Local daemon connects to localhost:8765
+# Tunnel forwards to London's daemon
+```
+
+Benefits:
+- Auth handled by SSH (keys, etc.)
+- Encryption handled by SSH
+- No new code needed
+- Operationally familiar
+
+Downsides:
+- Manual tunnel management
+- Single point of failure per tunnel
+- Doesn't scale to large fleets
+
+**Recommendation:**
+Document SSH tunnel option for simple 2-3 server setups.
+
+---
+
+## 9. Recommendations Summary
+
+### Must Fix Before Implementation
+
+1. **End-to-end delivery confirmation** - Sender must know message was injected
+2. **Registry consistency** - Define conflict resolution, prevent split-brain
+3. **Message deduplication** - Prevent replays using message ID
+
+### Should Address in v1
+
+4. **Bounded queues** - Prevent OOM from slow peers
+5. **Distributed tracing** - Correlation IDs for debugging
+6. **Token rotation** - Or switch to asymmetric auth
+7. **Rate limiting** - Prevent flood attacks
+
+### Can Defer to v2
+
+8. **Message priorities**
+9. **Graceful drain/migration**
+10. **Metrics/observability**
+11. **Hub HA**
+
+### Revise Estimates
+
+- **Realistic timeline: 8-10 weeks**
+- Plan stabilization phase
+- Consider MVP with reduced scope
+
+---
+
+## 10. Suggested MVP Scope
+
+To ship something useful faster, consider this reduced scope:
+
+**MVP (4 weeks):**
+- Static peer list only (no hub)
+- No TLS (rely on VPN/private network)
+- Single token per fleet (not per-pair)
+- Basic registry (no conflict handling - require unique names)
+- No queue persistence (memory only)
+- No message priorities
+
+**Post-MVP:**
+- TLS + proper auth
+- Hub for discovery
+- Queue persistence
+- Conflict resolution
+- Observability
+
+This gets cross-server messaging working quickly, then hardens iteratively.
+
+---
+
+## Conclusion
+
+The federation proposal has solid architectural bones—separation of routing and injection is the right call. However, it underestimates the complexity of distributed systems and glosses over critical details around consistency, delivery guarantees, and operations.
+
+**Verdict:** Good foundation, needs refinement before implementation. Address the 🔴 High severity issues, revise timeline, and consider an MVP approach.

From 1bff9555008b72a00a1f90031253180fdfd1bfb0 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 08:19:00 +0000
Subject: [PATCH 03/19] Revise federation proposal (v2) addressing critical
 review
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Major additions to address identified issues:

DELIVERY CONFIRMATION (Section 7):
- End-to-end ACK so sender knows message was injected
- Detection via capture-pane after send-keys
- Optional confirmation notification to sender

REGISTRY CONSISTENCY (Section 5.3):
- Fleet-wide unique names (no split-brain)
- Quorum-based registration with Lamport timestamps
- Clear error on name collision with suggestions

AUTHENTICATION (Section 8.1-8.3):
- Ed25519 asymmetric keys (scales better than N² tokens)
- Challenge-response handshake
- Per-message signing to prevent spoofing
- TOFU, static config, or CA options

FLOW CONTROL (Section 9):
- Credit-based flow control
- PEER_BUSY/PEER_READY backpressure signals
- Token bucket rate limiting (per-peer, per-agent, fleet-wide)
- Bounded queues with drop policies

TRANSPORT ABSTRACTION (Section 11):
- Pluggable PeerTransport interface
- WebSocket implementation (default)
- NATS JetStream implementation (optional)
- Migration path from WebSocket to NATS

TIMELINE (Section 14.3-14.4):
- Realistic estimate: 8-10 weeks (not 4-5)
- MVP option for 4-week delivery
- Phase 6: Stabilization added

OPEN QUESTIONS (Section 16):
- 10 unresolved questions for discussion
- Recommendations for each
- Clear decision points before implementation

bd-TBD
---
 docs/FEDERATION_PROPOSAL.md | 762 +++++++++++++++++++++++++++++++++---
 1 file changed, 697 insertions(+), 65 deletions(-)

diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md
index 2aff708b5..37c68053c 100644
--- a/docs/FEDERATION_PROPOSAL.md
+++ b/docs/FEDERATION_PROPOSAL.md
@@ -1,9 +1,22 @@
 # Agent Relay Federation: Cross-Server Communication Proposal
 
+**Status:** Draft v2 (revised after critical review)
+**Last Updated:** 2025-12-21
+
 ## Executive Summary
 
 This proposal extends agent-relay to support **federated multi-server deployments** while preserving the core differentiator: **automatic message injection via tmux**. Unlike polling-based systems (mcp_agent_mail OSS), federated agent-relay maintains real-time, interrupt-driven communication across server boundaries.
 
+### Key Design Decisions (v2)
+
+| Decision | Choice | Rationale |
+|----------|--------|-----------|
+| Transport | Pluggable (WebSocket default, NATS optional) | Start simple, scale up |
+| Delivery | End-to-end confirmation | Sender knows agent received |
+| Naming | Fleet-wide unique names | Avoid split-brain complexity |
+| Auth | Asymmetric keys (Ed25519) | Scales better than N² tokens |
+| Backpressure | Credit-based flow control | Prevent OOM on slow peers |
+
 ```
 ┌─────────────────────────────────────────────────────────────────────────────┐
 │                           FEDERATED AGENT-RELAY                              │
@@ -38,12 +51,16 @@ This proposal extends agent-relay to support **federated multi-server deployment
 4. [Protocol Specification](#4-protocol-specification)
 5. [Agent Discovery & Registry](#5-agent-discovery--registry)
 6. [Message Routing](#6-message-routing)
-7. [Security Model](#7-security-model)
-8. [Failure Handling & Resilience](#8-failure-handling--resilience)
-9. [Configuration](#9-configuration)
-10. [CLI Interface](#10-cli-interface)
-11. [Implementation Plan](#11-implementation-plan)
-12. [Migration Path](#12-migration-path)
+7. [Delivery Confirmation](#7-delivery-confirmation) *(NEW)*
+8. [Security Model](#8-security-model)
+9. [Flow Control & Backpressure](#9-flow-control--backpressure) *(NEW)*
+10. [Failure Handling & Resilience](#10-failure-handling--resilience)
+11. [Transport Abstraction (NATS Option)](#11-transport-abstraction-nats-option) *(NEW)*
+12. [Configuration](#12-configuration)
+13. [CLI Interface](#13-cli-interface)
+14. [Implementation Plan](#14-implementation-plan)
+15. [Migration Path](#15-migration-path)
+16. [Open Questions](#16-open-questions) *(NEW)*
 
 ---
 
@@ -471,30 +488,51 @@ Agents can be addressed in multiple ways:
 | `@relay:*@local` | Broadcast to local server only |
 | `@relay:*@nyc` | Broadcast to all agents on "nyc" |
 
-### 5.3 Name Collision Handling
+### 5.3 Name Collision Handling (v2: Fleet-Wide Uniqueness)
 
-If two agents have the same name on different servers:
+**Design Decision:** Agent names must be unique across the entire fleet.
 
-1. **Local preference**: Unqualified name routes to local agent first
-2. **First-registered wins**: For fleet-wide lookup, first to register owns the name
-3. **Explicit qualification**: Use `@relay:Bob@server-id` to disambiguate
-4. **Warning on collision**: Daemon logs warning when collision detected
+This is simpler than "first-registered wins" which has race conditions with async gossip. With fleet-wide uniqueness:
+- No split-brain scenarios
+- No ambiguous routing
+- Clear error on collision
 
 ```typescript
-// Resolution algorithm
-function resolveAgent(name: string, fromServer: string): AgentRecord | null {
-  // Check for explicit qualification
+// Registration flow
+async function registerAgent(name: string, serverId: string): Promise<Result> {
+  // Check fleet-wide registry
+  if (registry.exists(name)) {
+    const existing = registry.get(name);
+    return {
+      success: false,
+      error: `Name "${name}" already registered on ${existing.server_id}`,
+      suggestion: `${name}-${serverId.slice(0, 4)}` // e.g., "Bob-nyc1"
+    };
+  }
+
+  // Broadcast reservation with Lamport timestamp
+  await broadcastReservation(name, serverId, lamportClock.tick());
+
+  // Wait for quorum acknowledgment (majority of peers)
+  const acks = await waitForAcks(name, QUORUM_TIMEOUT_MS);
+  if (acks < quorumSize()) {
+    return { success: false, error: 'Failed to achieve quorum' };
+  }
+
+  registry.add(name, serverId);
+  return { success: true };
+}
+
+// Resolution is now simple
+function resolveAgent(name: string): AgentRecord | null {
+  // Check for explicit qualification (still supported)
   if (name.includes('@')) {
     const [agentName, serverSpec] = name.split('@');
     return registry.findOnServer(agentName, serverSpec);
   }
 
-  // Try local first
-  const local = registry.findLocal(name, fromServer);
-  if (local) return local;
-
-  // Fleet-wide lookup (first registered)
-  return registry.findAny(name);
+  // Fleet-wide lookup (guaranteed unique)
+  return registry.get(name);
 }
 ```
 
@@ -671,24 +709,203 @@ function handlePeerRoute(msg: PeerEnvelope<PeerRoutePayload>): void {
 
 ---
 
-## 7. Security Model
+## 7. Delivery Confirmation *(NEW)*
+
+### 7.1 The Problem
+
+Without end-to-end confirmation, senders don't know if messages were actually received:
+
+```
+Alice sends → Daemon A → Daemon B → tmux send-keys → ???
+                          ↑
+                          ACK (peer received)
+
+But: Did Bob's agent actually see it?
+     - tmux session might have crashed
+     - Agent might be blocked
+     - Injection might have failed silently
+```
+
+### 7.2 Solution: DELIVERY_CONFIRMED Message
+
+Add a new message type that confirms the message was injected into the agent's terminal:
+
+```typescript
+interface DeliveryConfirmedPayload {
+  original_message_id: string;  // ID of the message being confirmed
+  injected_at: number;          // Timestamp when send-keys completed
+  agent_status: 'active' | 'idle' | 'unknown';
+}
+```
+
+### 7.3 Confirmation Flow
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                         END-TO-END DELIVERY CONFIRMATION                     │
+│                                                                              │
+│  Alice@A                    Daemon A         Daemon B              Bob@B    │
+│     │                          │                │                    │       │
+│     │ ── @relay:Bob msg ────►  │                │                    │       │
+│     │                          │ ─ PEER_ROUTE ─►│                    │       │
+│     │                          │                │ ── send-keys ────► │       │
+│     │                          │                │    (inject msg)    │       │
+│     │                          │                │                    │       │
+│     │                          │                │ ◄── capture-pane ──│       │
+│     │                          │                │    (detect receipt)│       │
+│     │                          │                │                    │       │
+│     │                          │ ◄─ DELIVERY_ ──│                    │       │
+│     │                          │   CONFIRMED    │                    │       │
+│     │                          │                │                    │       │
+│     │ ◄─ inject confirmation ──│                │                    │       │
+│     │   "[✓] Bob received"     │                │                    │       │
+│     │                          │                │                    │       │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+### 7.4 Detection Mechanism
+
+After injecting a message, the TmuxWrapper watches for evidence the agent received it:
+
+```typescript
+async function confirmDelivery(messageId: string): Promise<boolean> {
+  // Wait for agent to echo/process the message
+  const startTime = Date.now();
+  const timeout = 5000; // 5 seconds
+
+  while (Date.now() - startTime < timeout) {
+    const output = await capturePane();
+
+    // Look for our injected message in output
+    if (output.includes(`Relay message from`) && output.includes(messageId.slice(0, 8))) {
+      return true;
+    }
+
+    await sleep(200);
+  }
+
+  return false; // Timeout - uncertain delivery
+}
+```
+
+### 7.5 Sender Notification
+
+Senders receive confirmation or timeout notification:
+
+```
+# Success case
+[relay:Alice] → Bob: Can you review auth.ts?
+[relay:Alice] ✓ Bob received (145ms)
+
+# Timeout case
+[relay:Alice] → Bob: Can you review auth.ts?
+[relay:Alice] ⚠ Delivery to Bob unconfirmed (timeout)
+```
+
+### 7.6 Configuration
+
+Delivery confirmation is optional (adds latency):
+
+```yaml
+federation:
+  delivery_confirmation:
+    enabled: true           # Enable end-to-end confirmation
+    timeout_ms: 5000        # How long to wait for confirmation
+    notify_sender: true     # Inject confirmation into sender's terminal
+```
+
+---
+
+## 8. Security Model
 
-### 7.1 Authentication
+### 8.1 Authentication (v2: Asymmetric Keys)
 
-**Pre-shared tokens** for simplicity:
+**Design Decision:** Use Ed25519 keypairs instead of pre-shared tokens.
 
+Pre-shared tokens don't scale: N servers = N² tokens. With asymmetric keys:
+- Each server has one keypair
+- Servers exchange public keys once
+- Challenge-response authentication
+- Easy key rotation
+
+```typescript
+// Each server generates a keypair on first run
+interface ServerIdentity {
+  server_id: string;
+  public_key: string;   // Ed25519 public key (base64)
+  private_key: string;  // Ed25519 private key (stored securely)
+}
+
+// Handshake uses challenge-response
+interface PeerHelloPayload {
+  server_id: string;
+  public_key: string;
+  challenge: string;          // Random nonce
+  challenge_signature: string; // Sign our challenge with our private key
+}
+
+interface PeerWelcomePayload {
+  server_id: string;
+  challenge_response: string;  // Sign their challenge with our private key
+  // ... rest of welcome
+}
+```
+
+### 8.2 Key Distribution
+
+Options for distributing public keys:
+
+**Option A: Static Configuration (Simple)**
+```yaml
+auth:
+  private_key_path: /etc/agent-relay/server.key
+  known_peers:
+    london-prod-01: "ed25519:abc123..."  # Public key
+    tokyo-prod-01: "ed25519:def456..."
+```
+
+**Option B: Trust-on-First-Use (TOFU)**
+```yaml
+auth:
+  tofu_enabled: true      # Accept new peers, remember their keys
+  tofu_require_approval: true  # Require human approval for new peers
+```
+
+**Option C: Certificate Authority (Enterprise)**
 ```yaml
-# Server A config
 auth:
-  server_token: "server-a-secret-token"
-  peer_tokens:
-    server-b: "shared-secret-ab"
-    server-c: "shared-secret-ac"
+  ca_cert: /etc/agent-relay/ca.pem
+  server_cert: /etc/agent-relay/server.pem
+  server_key: /etc/agent-relay/server.key
 ```
 
-Tokens are exchanged in PEER_HELLO and validated before PEER_WELCOME.
+### 8.3 Message Signing
+
+Each peer-to-peer message is signed:
+
+```typescript
+interface PeerEnvelope<T> {
+  // ... existing fields
+  signature: string;  // Ed25519 signature of (type + id + ts + payload)
+}
+
+function signEnvelope(envelope: PeerEnvelope, privateKey: Key): string {
+  const payload = JSON.stringify({
+    type: envelope.type,
+    id: envelope.id,
+    ts: envelope.ts,
+    payload: envelope.payload
+  });
+  return ed25519.sign(payload, privateKey);
+}
 
-### 7.2 Transport Security
+function verifyEnvelope(envelope: PeerEnvelope, publicKey: Key): boolean {
+  // Reject if signature invalid - prevents spoofing
+  return ed25519.verify(envelope.signature, publicKey);
+}
+```
+
+### 8.4 Transport Security
 
 **Mandatory TLS** for peer connections:
 
@@ -704,7 +921,7 @@ const ws = new WebSocket(peerUrl, {
 });
 ```
 
-### 7.3 Authorization
+### 8.5 Authorization
 
 Simple capability model:
 
@@ -717,7 +934,7 @@ interface ServerCapabilities {
 }
 ```
 
-### 7.4 Security Checklist
+### 8.6 Security Checklist
 
 | Control | Status | Notes |
 |---------|--------|-------|
@@ -730,9 +947,163 @@ interface ServerCapabilities {
 
 ---
 
-## 8. Failure Handling & Resilience
+## 9. Flow Control & Backpressure *(NEW)*
+
+### 15.1 The Problem
+
+Without flow control, a fast sender can overwhelm a slow receiver:
+
+```
+Server A: sends 1000 msgs/sec to Server B
+Server B: can only inject 10 msgs/sec (agents busy)
+Server B: queue grows → memory exhaustion → OOM crash
+```
+
+### 15.2 Credit-Based Flow Control
+
+Each peer connection has a credit window:
+
+```typescript
+interface FlowControl {
+  // Sender side
+  credits: number;          // How many messages we can send
+  pendingAcks: Map<string, Envelope>;  // Messages awaiting ACK
+
+  // Receiver side
+  windowSize: number;       // Max messages before ACK required
+  received: number;         // Messages received since last ACK
+}
+
+// Sender checks credits before sending
+function canSend(): boolean {
+  return this.credits > 0;
+}
+
+function send(envelope: Envelope): boolean {
+  if (!this.canSend()) {
+    this.queue.push(envelope);  // Queue locally
+    return false;
+  }
+
+  this.credits--;
+  this.pendingAcks.set(envelope.id, envelope);
+  this.peer.send(envelope);
+  return true;
+}
+
+// Receiver sends PEER_ACK to replenish credits
+function onReceive(envelope: Envelope): void {
+  this.received++;
+
+  if (this.received >= this.windowSize / 2) {
+    this.peer.send({
+      type: 'PEER_ACK',
+      payload: { credits: this.received }
+    });
+    this.received = 0;
+  }
+}
+
+// Sender receives ACK, replenishes credits
+function onAck(ack: PeerAckPayload): void {
+  this.credits += ack.credits;
+  this.flushQueue();  // Send queued messages
+}
+```
+
+### 15.3 Backpressure Signals
+
+When a receiver is overwhelmed, it sends PEER_BUSY:
+
+```typescript
+type PeerMessageType =
+  // ... existing types
+  | 'PEER_ACK'    // Replenish sender credits
+  | 'PEER_BUSY'   // Receiver overwhelmed, stop sending
+  | 'PEER_READY'; // Receiver recovered, resume
+
+interface PeerBusyPayload {
+  reason: 'queue_full' | 'agent_busy' | 'rate_limited';
+  retry_after_ms?: number;  // Suggested wait time
+}
+```
+
+### 9.4 Rate Limiting
+
+Per-peer and fleet-wide rate limits:
+
+```typescript
+interface RateLimiter {
+  // Token bucket algorithm
+  tokens: number;
+  maxTokens: number;
+  refillRate: number;  // tokens per second
+
+  tryConsume(count: number): boolean {
+    this.refill();
+    if (this.tokens >= count) {
+      this.tokens -= count;
+      return true;
+    }
+    return false;
+  }
+}
+
+// Applied at multiple levels
+const limits = {
+  perPeer: new RateLimiter({ maxTokens: 100, refillRate: 50 }),     // 50/sec per peer
+  perAgent: new RateLimiter({ maxTokens: 20, refillRate: 10 }),     // 10/sec per agent
+  fleetWide: new RateLimiter({ maxTokens: 1000, refillRate: 200 }), // 200/sec total
+};
+```
+
+### 9.5 Bounded Queues
+
+Queues have maximum sizes with drop policies:
+
+```typescript
+interface BoundedQueue<T> {
+  maxSize: number;
+  dropPolicy: 'oldest' | 'newest' | 'reject';
+
+  push(item: T): boolean {
+    if (this.items.length >= this.maxSize) {
+      switch (this.dropPolicy) {
+        case 'oldest':
+          this.items.shift();  // Drop oldest
+          break;
+        case 'newest':
+          return false;        // Reject new item
+        case 'reject':
+          throw new QueueFullError();
+      }
+    }
+    this.items.push(item);
+    return true;
+  }
+}
+```
+
+### 9.6 Configuration
+
+```yaml
+federation:
+  flow_control:
+    window_size: 100          # Messages before ACK required
+    max_queue_size: 1000      # Max queued messages per peer
+    queue_drop_policy: oldest # oldest | newest | reject
+
+  rate_limits:
+    per_peer_per_second: 50
+    per_agent_per_second: 10
+    fleet_wide_per_second: 200
+```
+
+---
+
+## 10. Failure Handling & Resilience
 
-### 8.1 Connection Failures
+### 13.1 Connection Failures
 
 ```
 ┌─────────────────────────────────────────────────────────────────┐
@@ -762,7 +1133,7 @@ interface ServerCapabilities {
 └─────────────────────────────────────────────────────────────────┘
 ```
 
-### 8.2 Split Brain Prevention
+### 13.2 Split Brain Prevention
 
 If the fleet gets partitioned:
 
@@ -771,7 +1142,7 @@ If the fleet gets partitioned:
 3. **No automatic conflict resolution** - messages deliver in order received
 4. **TTL expiration** - queued messages expire after 1 hour (configurable)
 
-### 8.3 Graceful Degradation
+### 13.3 Graceful Degradation
 
 ```
 Fleet healthy:     A ◄──► B ◄──► C    (full connectivity)
@@ -782,7 +1153,7 @@ B goes down:       A ◄─X─► B ◄─X─► C
 B comes back:      A ◄──► B ◄──► C    (queued messages flush)
 ```
 
-### 8.4 Health Monitoring
+### 10.4 Health Monitoring
 
 ```typescript
 // Heartbeat every 30 seconds
@@ -804,9 +1175,158 @@ setInterval(() => {
 
 ---
 
-## 9. Configuration
+## 11. Transport Abstraction (NATS Option) *(NEW)*
+
+### 14.1 Motivation
+
+The custom WebSocket protocol works for simple deployments, but production fleets may benefit from battle-tested message infrastructure. NATS JetStream provides:
+
+- ✅ Persistent message queues (survive restarts)
+- ✅ Exactly-once delivery semantics
+- ✅ Built-in clustering and HA
+- ✅ Backpressure and flow control
+- ✅ Rich observability (metrics, tracing)
+- ✅ Years of production hardening
+
+**Trade-off:** External dependency vs. implementation effort.
+
+### 14.2 Transport Interface
+
+Abstract the transport layer so implementations are swappable:
+
+```typescript
+interface PeerTransport {
+  // Lifecycle
+  connect(config: TransportConfig): Promise<void>;
+  disconnect(): Promise<void>;
+
+  // Messaging
+  send(serverId: string, envelope: PeerEnvelope): Promise<void>;
+  broadcast(envelope: PeerEnvelope): Promise<void>;
+  subscribe(handler: (from: string, envelope: PeerEnvelope) => void): void;
+
+  // Discovery
+  getConnectedPeers(): string[];
+  onPeerJoin(handler: (serverId: string) => void): void;
+  onPeerLeave(handler: (serverId: string) => void): void;
+}
+```
+
+### 14.3 WebSocket Implementation (Default)
+
+```typescript
+class WebSocketTransport implements PeerTransport {
+  private connections: Map<string, WebSocket>;
+
+  async connect(config: TransportConfig): Promise<void> {
+    for (const peer of config.peers) {
+      const ws = new WebSocket(peer.url);
+      // ... handshake, auth
+      this.connections.set(peer.serverId, ws);
+    }
+  }
+
+  async send(serverId: string, envelope: PeerEnvelope): Promise<void> {
+    const ws = this.connections.get(serverId);
+    ws?.send(JSON.stringify(envelope));
+  }
+
+  // ... rest of implementation
+}
+```
+
+### 11.4 NATS Implementation (Optional)
+
+```typescript
+class NatsTransport implements PeerTransport {
+  private nc: NatsConnection;
+  private js: JetStreamClient;
+
+  async connect(config: TransportConfig): Promise<void> {
+    this.nc = await connect({ servers: config.natsUrl });
+    this.js = this.nc.jetstream();
+
+    // Create stream for fleet messages
+    await this.js.streams.add({
+      name: 'RELAY_FLEET',
+      subjects: ['relay.>'],
+      retention: RetentionPolicy.Limits,
+      max_age: 3600 * 1e9, // 1 hour
+    });
+  }
+
+  async send(serverId: string, envelope: PeerEnvelope): Promise<void> {
+    // Publish to server-specific subject
+    await this.js.publish(`relay.server.${serverId}`, encode(envelope));
+  }
+
+  async broadcast(envelope: PeerEnvelope): Promise<void> {
+    // Publish to broadcast subject
+    await this.js.publish('relay.broadcast', encode(envelope));
+  }
+
+  subscribe(handler: (from: string, envelope: PeerEnvelope) => void): void {
+    // Subscribe to our server subject + broadcast
+    const sub = this.nc.subscribe(`relay.server.${this.serverId}`);
+    const broadcastSub = this.nc.subscribe('relay.broadcast');
+
+    (async () => {
+      for await (const msg of sub) {
+        const envelope = decode(msg.data);
+        handler(envelope.from_server, envelope);
+      }
+    })();
+  }
+}
+```
+
+### 11.5 Configuration
+
+```yaml
+federation:
+  # Transport selection
+  transport: websocket  # websocket | nats
+
+  # WebSocket config (if transport: websocket)
+  websocket:
+    peers:
+      - url: wss://london.example.com:8765
+        server_id: london
+
+  # NATS config (if transport: nats)
+  nats:
+    url: nats://nats.example.com:4222
+    credentials: /etc/agent-relay/nats.creds
+    stream_name: RELAY_FLEET
+```
+
+### 11.6 When to Use Which
+
+| Scenario | Recommended Transport | Rationale |
+|----------|----------------------|-----------|
+| 2-5 servers, simple setup | WebSocket | No external deps |
+| Development/testing | WebSocket | Easy to run locally |
+| 10+ servers | NATS | Better scaling |
+| High reliability required | NATS | Persistence, HA |
+| Already have NATS | NATS | Leverage existing |
+| Air-gapped/restricted | WebSocket | No external deps |
+
+### 11.7 Migration Path
+
+Start with WebSocket, migrate to NATS when needed:
+
+1. Deploy NATS cluster
+2. Update config: `transport: nats`
+3. Restart daemons (one at a time)
+4. Messages route through NATS immediately
+
+No changes to agents or injection logic.
+
+---
+
+## 12. Configuration
 
-### 9.1 Configuration File
+### 15.1 Configuration File
 
 ```yaml
 # /etc/agent-relay/config.yaml (or ~/.agent-relay/config.yaml)
@@ -879,7 +1399,7 @@ dashboard:
   show_fleet: true                    # Show all fleet agents
 ```
 
-### 9.2 Environment Variables
+### 15.2 Environment Variables
 
 All config can be overridden via environment:
 
@@ -896,7 +1416,7 @@ AGENT_RELAY_PEER_london-prod-01_TOKEN=london-token
 AGENT_RELAY_PEER_london-prod-01_URL=wss://london.example.com:8765
 ```
 
-### 9.3 Minimal Configuration
+### 15.3 Minimal Configuration
 
 For simple two-server setup:
 
@@ -932,9 +1452,9 @@ federation:
 
 ---
 
-## 10. CLI Interface
+## 13. CLI Interface
 
-### 10.1 New Commands
+### 13.1 New Commands
 
 ```bash
 # Start daemon with federation
@@ -963,7 +1483,7 @@ agent-relay fleet ping <server-id>     # Ping a peer
 agent-relay fleet trace <agent>        # Trace route to agent
 ```
 
-### 10.2 Example Session
+### 13.2 Example Session
 
 ```bash
 # On Server NYC
@@ -1000,7 +1520,7 @@ $ agent-relay -n Alice claude
 # "Relay message from Alice@nyc [abc123]: Can you help with the auth module?"
 ```
 
-### 10.3 Addressing Examples
+### 13.3 Addressing Examples
 
 ```bash
 # From Alice on NYC server:
@@ -1016,9 +1536,9 @@ $ agent-relay -n Alice claude
 
 ---
 
-## 11. Implementation Plan
+## 14. Implementation Plan
 
-### 11.1 Phases
+### 14.1 Phases
 
 ```
 ┌─────────────────────────────────────────────────────────────────────────────┐
@@ -1086,7 +1606,7 @@ $ agent-relay -n Alice claude
 └─────────────────────────────────────────────────────────────────────────────┘
 ```
 
-### 11.2 File Structure
+### 14.2 File Structure
 
 ```
 src/
@@ -1112,22 +1632,45 @@ src/
     └── index.ts                   # Modified: new commands
 ```
 
-### 11.3 Estimated Effort
+### 14.3 Estimated Effort (Revised)
+
+**Original estimate was too optimistic.** Distributed systems are hard. Realistic timeline:
 
-| Phase | Effort | Dependencies |
-|-------|--------|--------------|
-| Phase 1: Foundation | 3-4 days | None |
-| Phase 2: Registry | 3-4 days | Phase 1 |
-| Phase 3: Routing | 3-4 days | Phase 2 |
-| Phase 4: Resilience | 3-4 days | Phase 3 |
-| Phase 5: Security | 3-4 days | Phase 4 |
-| **Total** | **~4-5 weeks** | |
+| Phase | Optimistic | Realistic | Notes |
+|-------|------------|-----------|-------|
+| Phase 1: Foundation | 1 week | 1.5-2 weeks | WebSocket edge cases |
+| Phase 2: Registry | 1 week | 2 weeks | Consistency is hard |
+| Phase 3: Routing | 1 week | 1.5 weeks | Broadcast complexity |
+| Phase 4: Resilience | 1 week | 2-3 weeks | Reconnection, testing |
+| Phase 5: Security | 1 week | 2 weeks | TLS setup, key mgmt |
+| Phase 6: Stabilization | - | 2 weeks | Bug fixes, edge cases |
+| **Total** | 4-5 weeks | **8-10 weeks** | |
+
+### 14.4 MVP Option
+
+To ship faster, consider a reduced-scope MVP:
+
+**MVP Scope (4 weeks):**
+- Static peer list only (no hub)
+- No TLS (rely on VPN/private network)
+- Single fleet token (not per-pair)
+- Require unique names (no conflict resolution)
+- Memory-only queues (no persistence)
+- No message priorities
+
+**Post-MVP (add incrementally):**
+- TLS + asymmetric key auth
+- Hub for discovery
+- Queue persistence
+- Delivery confirmation
+- NATS transport option
+- Observability
 
 ---
 
-## 12. Migration Path
+## 15. Migration Path
 
-### 12.1 Backward Compatibility
+### 15.1 Backward Compatibility
 
 Existing single-server deployments work without changes:
 
@@ -1137,14 +1680,14 @@ local:
   socket_path: /tmp/agent-relay/relay.sock
 ```
 
-### 12.2 Upgrade Path
+### 15.2 Upgrade Path
 
 1. **Update agent-relay** to federation-capable version
 2. **Add federation config** to enable cross-server
 3. **Start daemons** - they auto-connect to peers
 4. **Agents just work** - no changes needed
 
-### 12.3 Rollback
+### 15.3 Rollback
 
 If issues arise:
 1. Set `federation.enabled: false`
@@ -1179,8 +1722,97 @@ This proposal extends agent-relay to support federated multi-server deployments
 
 ---
 
+## 16. Open Questions *(NEW)*
+
+These questions remain unresolved and need input before/during implementation:
+
+### Architecture
+
+1. **Hub vs. Mesh for MVP?**
+   - Hub is simpler but single point of failure
+   - Mesh is resilient but more complex
+   - Recommendation: Start with mesh (static peers), add hub later
+
+2. **Queue persistence?**
+   - Memory-only: Simple, but loses messages on crash
+   - SQLite: Survives restarts, but adds complexity
+   - Recommendation: Memory for MVP, SQLite for v2
+
+3. **NATS priority?**
+   - Implement WebSocket first, NATS later?
+   - Or start with NATS to avoid reimplementing?
+   - Recommendation: WebSocket MVP, NATS for production scale
+
+### Protocol
+
+4. **Message ordering guarantees?**
+   - Per-agent FIFO? Global ordering? Best-effort?
+   - Strict ordering adds latency and complexity
+   - Recommendation: Document best-effort, no guarantees
+
+5. **Broadcast scalability?**
+   - O(n) messages for n agents - acceptable?
+   - Need gossip-style fan-out for large fleets?
+   - Recommendation: Direct broadcast for <50 agents, revisit at scale
+
+### Security
+
+6. **Key distribution method?**
+   - Static config, TOFU, or CA?
+   - Trade-off: security vs. operational simplicity
+   - Recommendation: TOFU with approval for dev, CA for enterprise
+
+7. **mTLS required?**
+   - Adds complexity but strong authentication
+   - Alternative: TLS + Ed25519 challenge-response
+   - Recommendation: TLS + challenge-response for MVP
+
+### Operations
+
+8. **Testing strategy?**
+   - How to test multi-server locally?
+   - Need chaos testing framework?
+   - Recommendation: Docker Compose for integration tests
+
+9. **Observability from day one?**
+   - Add Prometheus metrics in MVP?
+   - Or defer to post-MVP?
+   - Recommendation: Basic metrics (connections, messages) in MVP
+
+10. **NAT traversal?**
+    - Support servers behind NAT?
+    - Requires connection reversal or TURN relay
+    - Recommendation: Document requirement for direct connectivity, defer NAT to v2
+
+---
+
+## Summary (v2)
+
+This revised proposal addresses the critical issues identified in review:
+
+| Issue | Resolution |
+|-------|------------|
+| No end-to-end ACK | Added delivery confirmation (Section 7) |
+| Registry split-brain | Fleet-wide unique names + quorum (Section 5.3) |
+| Token scaling | Asymmetric keys (Section 8.1) |
+| No backpressure | Credit-based flow control (Section 9) |
+| Timeline unrealistic | Revised to 8-10 weeks (Section 14.3) |
+| NATS consideration | Pluggable transport layer (Section 11) |
+
+**Key additions in v2:**
+- End-to-end delivery confirmation
+- Fleet-wide unique name enforcement
+- Ed25519 authentication (scales better)
+- Credit-based flow control + rate limiting
+- Transport abstraction for NATS option
+- Realistic timeline with MVP option
+- Open questions for discussion
+
+---
+
 ## Next Steps
 
-1. Review and approve this proposal
-2. Create implementation tasks in Beads
-3. Begin Phase 1: Foundation
+1. Review v2 proposal, discuss open questions
+2. Decide on MVP scope
+3. Create implementation tasks in Beads
+4. Begin Phase 1: Foundation

From c1ea132657977d3350b2a924067abe107a763246 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 09:17:47 +0000
Subject: [PATCH 04/19] Add Storage Architecture section to federation proposal
 (v2)

Addresses storage requirements for federated deployments by separating:
- Ephemeral storage (routing): Memory or NATS JetStream for message queues
- Durable storage (trajectories): File/SQLite local + PostgreSQL/S3 central

References the trajectories proposal (PR #3) for detailed format specification.
Includes configuration examples and federation impact analysis.
---
 docs/FEDERATION_PROPOSAL.md | 359 ++++++++++++++++++++++++++++++++++++
 1 file changed, 359 insertions(+)

diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md
index 37c68053c..b4bc548e1 100644
--- a/docs/FEDERATION_PROPOSAL.md
+++ b/docs/FEDERATION_PROPOSAL.md
@@ -61,6 +61,7 @@ This proposal extends agent-relay to support **federated multi-server deployment
 14. [Implementation Plan](#14-implementation-plan)
 15. [Migration Path](#15-migration-path)
 16. [Open Questions](#16-open-questions) *(NEW)*
+17. [Storage Architecture](#17-storage-architecture) *(NEW)*
 
 ---
 
@@ -1786,6 +1787,362 @@ These questions remain unresolved and need input before/during implementation:
 
 ---
 
+## 17. Storage Architecture *(NEW)*
+
+Federation introduces distinct storage requirements: **ephemeral storage** for message routing and **durable storage** for trajectories and work history. These have fundamentally different characteristics.
+
+### 17.1 Two Storage Domains
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                         STORAGE ARCHITECTURE                                 │
+├─────────────────────────────────────────────────────────────────────────────┤
+│                                                                              │
+│  ┌─────────────────────────┐      ┌─────────────────────────────────────┐  │
+│  │   EPHEMERAL STORAGE     │      │       DURABLE STORAGE                │  │
+│  │   (Message Routing)     │      │       (Trajectories)                 │  │
+│  │                         │      │                                      │  │
+│  │  • Peer message queues  │      │  • Agent work history               │  │
+│  │  • Pending ACKs         │      │  • Decisions & retrospectives       │  │
+│  │  • Flow control credits │      │  • Inter-agent conversations        │  │
+│  │  • Connection state     │      │  • Exported artifacts               │  │
+│  │                         │      │                                      │  │
+│  │  Lifetime: minutes/hours│      │  Lifetime: months/years             │  │
+│  │  Size: KB-MB per peer   │      │  Size: MB-GB per project            │  │
+│  │  Loss impact: retry     │      │  Loss impact: permanent             │  │
+│  │                         │      │                                      │  │
+│  │  Backend:               │      │  Backend:                           │  │
+│  │  • Memory (default)     │      │  • File system (default)            │  │
+│  │  • NATS JetStream       │      │  • SQLite (local queries)           │  │
+│  │                         │      │  • PostgreSQL (team sharing)        │  │
+│  │                         │      │  • S3/GCS (archive)                 │  │
+│  └─────────────────────────┘      └─────────────────────────────────────┘  │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+### 17.2 Ephemeral Storage (Message Routing)
+
+For federation's real-time message routing, **memory is the default**. Messages are transient—they matter for delivery, not history.
+
+#### In-Memory Queues (Default)
+
+```typescript
+class EphemeralStore {
+  // Per-peer outbound queues (for disconnected peers)
+  peerQueues: Map<string, BoundedQueue<PeerEnvelope>>;
+
+  // Pending delivery confirmations
+  pendingAcks: Map<string, { envelope: Envelope; sentAt: number }>;
+
+  // Flow control state
+  peerCredits: Map<string, number>;
+
+  // Configuration
+  config: {
+    maxQueueSize: 1000;           // Bounded to prevent OOM
+    ackTimeoutMs: 30000;          // Expire pending ACKs after 30s
+    queueTtlMs: 3600000;          // Drop queued messages after 1 hour
+  };
+}
+```
+
+**Properties:**
+- ✅ Fast (no I/O)
+- ✅ Simple (no external deps)
+- ❌ Lost on daemon restart
+- ❌ Limited by available memory
+
+**When this is fine:**
+- Most messages deliver immediately
+- Disconnections are brief (seconds to minutes)
+- Acceptable to lose queued messages on crash
+
+#### NATS JetStream (Optional Upgrade)
+
+When using NATS transport (Section 11), streams provide ephemeral persistence:
+
+```typescript
+// NATS stream for routing messages
+const routingStream = {
+  name: 'RELAY_ROUTING',
+  subjects: ['relay.route.*', 'relay.broadcast'],
+  retention: RetentionPolicy.Limits,
+  max_age: 3600 * 1e9,           // 1 hour retention
+  max_bytes: 100 * 1024 * 1024,  // 100 MB max
+  discard: DiscardPolicy.Old,    // Drop oldest on limit
+};
+```
+
+**Properties:**
+- ✅ Survives daemon restarts
+- ✅ Shared across peers (no per-peer queuing)
+- ✅ Built-in flow control and backpressure
+- ❌ External dependency
+- ❌ Additional operational complexity
+
+**When to use NATS:**
+- High message volume
+- Long disconnection tolerance needed
+- Already have NATS infrastructure
+
+### 17.3 Durable Storage (Trajectories)
+
+For long-term work history, **durable storage is essential**. This stores agent trajectories—the complete record of task work including prompts, reasoning, decisions, and retrospectives.
+
+> **See also:** [Trajectories Proposal](https://github.com/khaliqgant/agent-relay/pull/3) for detailed format specification.
+
+#### Storage Tiers
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                    TRAJECTORY STORAGE TIERS                                  │
+│                                                                              │
+│  ┌─────────────┐     ┌─────────────┐     ┌─────────────┐     ┌────────────┐│
+│  │  Active     │     │   Local     │     │   Central   │     │  Archive   ││
+│  │  (File)     │────►│  (SQLite)   │────►│  (Postgres) │────►│  (S3)      ││
+│  │             │     │             │     │             │     │            ││
+│  │ In-progress │     │ Completed   │     │ Team-shared │     │ Cold       ││
+│  │ trajectories│     │ trajectories│     │ trajectories│     │ storage    ││
+│  │             │     │ (indexed)   │     │             │     │            ││
+│  │ .trajectories/    │ trajectories.db   │ Central DB  │     │ S3 bucket  ││
+│  │  active/    │     │             │     │             │     │            ││
+│  └─────────────┘     └─────────────┘     └─────────────┘     └────────────┘│
+│                                                                              │
+│  Speed: ◄────────────────────────────────────────────────────────────► Cost │
+│         Fastest                                                    Cheapest  │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+#### File System (Default)
+
+```
+.trajectories/
+├── index.json                    # Quick lookup index
+├── active/                       # In-progress trajectories
+│   └── traj_abc123.json
+├── completed/                    # Finished trajectories
+│   ├── 2024-01/
+│   │   ├── traj_def456.json     # Full trajectory data
+│   │   └── traj_def456.md       # Human-readable export
+│   └── 2024-02/
+└── archive/                      # Compressed old trajectories
+```
+
+**Properties:**
+- ✅ Git-friendly (can commit trajectories with code)
+- ✅ No external deps
+- ✅ Portable (copy directory to share)
+- ❌ No cross-server queries
+- ❌ No team sharing without file sync
+
+#### SQLite (Local)
+
+For indexing and querying completed trajectories:
+
+```sql
+-- Same DB can hold both routing state and trajectories
+-- /tmp/agent-relay/state.sqlite
+
+CREATE TABLE trajectories (
+  id TEXT PRIMARY KEY,
+  task_id TEXT,
+  project_id TEXT NOT NULL,
+  started_at INTEGER NOT NULL,
+  completed_at INTEGER,
+  agent_names TEXT,                -- JSON array
+  chapters TEXT NOT NULL,          -- JSON
+  retrospective TEXT,              -- JSON
+  created_at INTEGER NOT NULL
+);
+
+CREATE INDEX idx_traj_task ON trajectories(task_id);
+CREATE INDEX idx_traj_project ON trajectories(project_id);
+```
+
+**Properties:**
+- ✅ Fast local queries
+- ✅ Single-file, easy backup
+- ✅ No external deps
+- ❌ Single-server scope
+
+#### PostgreSQL (Central)
+
+For team-wide trajectory sharing:
+
+```typescript
+// Central trajectory store configuration
+interface CentralStorageConfig {
+  type: 'postgresql';
+  connectionString: string;
+  schema: 'agent_relay';
+
+  // Sync behavior
+  syncOnComplete: boolean;        // Push trajectory when task completes
+  syncOnDemand: boolean;          // Pull trajectories from central
+  conflictResolution: 'server-wins' | 'local-wins' | 'merge';
+}
+```
+
+**Properties:**
+- ✅ Team-wide visibility
+- ✅ Rich querying (across all servers)
+- ✅ Central backup
+- ❌ Requires PostgreSQL infrastructure
+- ❌ Network dependency for writes
+
+#### S3/GCS (Archive)
+
+For long-term cold storage:
+
+```typescript
+interface ArchiveConfig {
+  type: 's3' | 'gcs';
+  bucket: string;
+  prefix: 'trajectories/';
+
+  // Lifecycle
+  archiveAfterDays: 90;           // Move to archive after 90 days
+  format: 'json' | 'json.gz';     // Compress for storage
+}
+```
+
+### 17.4 Export Format
+
+Trajectories export to a portable `.trajectory` format:
+
+```
+task-bd-123.trajectory/
+├── manifest.json                 # Metadata and table of contents
+├── trajectory.json               # Machine-readable full data
+├── trajectory.md                 # Human-readable narrative
+└── assets/                       # Attachments (screenshots, files)
+    ├── screenshot-001.png
+    └── diff-summary.patch
+```
+
+**Manifest structure:**
+
+```json
+{
+  "version": 1,
+  "trajectory_id": "traj_abc123",
+  "task": {
+    "source": "beads",
+    "id": "bd-123",
+    "title": "Implement rate limiting"
+  },
+  "created_at": "2025-01-15T10:00:00Z",
+  "completed_at": "2025-01-15T14:30:00Z",
+  "agents": ["Alice", "Bob"],
+  "summary": {
+    "chapters": 5,
+    "decisions": 3,
+    "files_changed": 12,
+    "total_events": 156
+  }
+}
+```
+
+### 17.5 Storage Configuration
+
+```yaml
+# /etc/agent-relay/config.yaml
+
+storage:
+  # Ephemeral (routing)
+  ephemeral:
+    type: memory                   # memory | nats
+    max_queue_per_peer: 1000
+    queue_ttl_ms: 3600000
+
+    # If using NATS
+    nats:
+      stream: RELAY_ROUTING
+      max_age_seconds: 3600
+      max_bytes: 104857600         # 100 MB
+
+  # Durable (trajectories)
+  trajectories:
+    # Local storage
+    local:
+      type: file                   # file | sqlite
+      path: .trajectories/
+      index_db: .trajectories/index.sqlite
+
+    # Optional central storage
+    central:
+      enabled: false
+      type: postgresql
+      connection_string: "${TRAJECTORY_DB_URL}"
+      sync_on_complete: true
+
+    # Optional archive
+    archive:
+      enabled: false
+      type: s3
+      bucket: company-trajectories
+      region: us-east-1
+      archive_after_days: 90
+```
+
+### 17.6 Federation Impact on Storage
+
+When federation is enabled, storage considerations change:
+
+| Concern | Single Server | Federated Fleet |
+|---------|---------------|-----------------|
+| **Routing queues** | Per-agent | Per-peer + per-agent |
+| **Registry** | Local only | Fleet-wide sync |
+| **Trajectories** | Local files | Central DB recommended |
+| **Message history** | Optional | Recommended for debugging |
+
+**Recommendations for federated deployments:**
+
+1. **Routing:** Use NATS if available, otherwise memory with bounded queues
+2. **Registry:** Memory + periodic persistence (survive restarts)
+3. **Trajectories:** SQLite local + PostgreSQL central for team visibility
+4. **Archive:** S3 for cost-effective long-term storage
+
+### 17.7 Data Flow
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                    DATA FLOW: ROUTING vs TRAJECTORIES                        │
+│                                                                              │
+│  Message Send                              Task Work                         │
+│       │                                         │                            │
+│       ▼                                         ▼                            │
+│  ┌─────────────┐                         ┌─────────────┐                    │
+│  │ Route via   │                         │ Capture     │                    │
+│  │ ephemeral   │                         │ events      │                    │
+│  │ queues      │                         │ (trajectory)│                    │
+│  └──────┬──────┘                         └──────┬──────┘                    │
+│         │                                       │                            │
+│         ▼                                       ▼                            │
+│  ┌─────────────┐                         ┌─────────────┐                    │
+│  │ Deliver     │                         │ Write to    │                    │
+│  │ & discard   │◄── separate ───────────►│ durable     │                    │
+│  │             │    concerns             │ storage     │                    │
+│  └─────────────┘                         └──────┬──────┘                    │
+│                                                 │                            │
+│                                                 ▼                            │
+│                                          ┌─────────────┐                    │
+│                                          │ Sync to     │                    │
+│                                          │ central     │                    │
+│                                          │ (optional)  │                    │
+│                                          └──────┬──────┘                    │
+│                                                 │                            │
+│                                                 ▼                            │
+│                                          ┌─────────────┐                    │
+│                                          │ Archive to  │                    │
+│                                          │ S3 (cold)   │                    │
+│                                          └─────────────┘                    │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
 ## Summary (v2)
 
 This revised proposal addresses the critical issues identified in review:
@@ -1798,6 +2155,7 @@ This revised proposal addresses the critical issues identified in review:
 | No backpressure | Credit-based flow control (Section 9) |
 | Timeline unrealistic | Revised to 8-10 weeks (Section 14.3) |
 | NATS consideration | Pluggable transport layer (Section 11) |
+| Storage for trajectories | Two-tier storage architecture (Section 17) |
 
 **Key additions in v2:**
 - End-to-end delivery confirmation
@@ -1805,6 +2163,7 @@ This revised proposal addresses the critical issues identified in review:
 - Ed25519 authentication (scales better)
 - Credit-based flow control + rate limiting
 - Transport abstraction for NATS option
+- Storage architecture (ephemeral routing + durable trajectories)
 - Realistic timeline with MVP option
 - Open questions for discussion
 

From 7420de8ddb307cf0c9d917ad510b0b33c89f26ed Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 10:48:11 +0000
Subject: [PATCH 05/19] Add 28 federation build tasks with agent assignments

Tasks organized by phase (1-5) and assigned to agent roles:
- Architect: Protocol design, testing, docs (3 tasks)
- Network: PeerConnection, PeerManager, reconnection, flow control (8 tasks)
- Router: FleetRegistry, routing, broadcast, delivery confirmation (8 tasks)
- Security: TLS, Ed25519 authentication (2 tasks)
- Storage: Message queues, trajectory storage (3 tasks)
- CLI: Fleet commands, config, dashboard (4 tasks)

Dependencies mapped to ensure correct build order.
See docs/FEDERATION_PROPOSAL.md for full specification.
---
 .beads/issues.jsonl | 28 ++++++++++++++++++++++++++++
 1 file changed, 28 insertions(+)

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index c01f29238..50821c345 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -49,3 +49,31 @@
 {"id":"agent-relay-sio","title":"Add graceful degradation when relay daemon is unavailable","description":"In wrapper/tmux-wrapper.ts:195-197, daemon connection failures are silently caught. Consider: (1) Periodic reconnection attempts, (2) Queueing messages for later delivery, (3) Visual indicator in terminal showing connection status.","status":"open","priority":2,"issue_type":"feature","created_at":"2025-12-20T00:17:32.600333+01:00","updated_at":"2025-12-20T00:17:32.600333+01:00"}
 {"id":"agent-relay-ucw","title":"Dashboard: multi-project navigation or dynamic port allocation","description":"When the dashboard is already running for one project, users should be able to either: (1) Navigate between different projects in a single dashboard view, OR (2) Start a new dashboard instance on an automatically allocated available port for a different project. Currently if a dashboard is running, starting another project conflicts.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-19T23:40:18.667766+01:00","updated_at":"2025-12-20T00:18:05.10495+01:00","closed_at":"2025-12-20T00:18:05.10495+01:00"}
 {"id":"agent-relay-v57","title":"No message expiration/cleanup in SQLite storage","description":"SQLite adapter has no TTL or cleanup mechanism for old messages. Over time, the database will grow unbounded. Add: (1) Configurable message retention period, (2) Automatic cleanup job, (3) Index on ts column is there but no cleanup uses it.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-20T00:18:01.86766+01:00","updated_at":"2025-12-20T00:18:01.86766+01:00"}
+{"id":"fed-001","title":"[Phase 1] Define peer protocol types","description":"Create src/protocol/peer-types.ts with PeerMessageType, PeerEnvelope, PeerHelloPayload, PeerWelcomePayload, PeerSyncPayload, PeerRoutePayload, PeerBroadcastPayload. See FEDERATION_PROPOSAL.md Section 4.","status":"open","priority":1,"issue_type":"task","assignee":"Architect","created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-002","title":"[Phase 1] Implement PeerConnection class","description":"Create src/federation/peer-connection.ts. WebSocket connection to single peer with state machine (DISCONNECTED→CONNECTING→HANDSHAKING→ACTIVE→RECONNECTING). Handle HELLO/WELCOME handshake. See FEDERATION_PROPOSAL.md Section 4.6.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-001"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-003","title":"[Phase 1] Implement PeerManager","description":"Create src/federation/peer-manager.ts. Manages multiple PeerConnection instances. Connect to configured peers on daemon start. Handle peer discovery from WELCOME messages.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-002"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-004","title":"[Phase 1] Add peer WebSocket listener to daemon","description":"Extend daemon to listen for incoming peer connections on configurable port (default 8765). Accept connections, perform handshake, register with PeerManager.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-003"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-005","title":"[Phase 2] Implement FleetRegistry","description":"Create src/federation/registry.ts with FleetRegistry class. Track agents across fleet with agentToServer mapping. Support fleet-wide unique name enforcement with quorum registration. See FEDERATION_PROPOSAL.md Section 5.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-004"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2"]}
+{"id":"fed-006","title":"[Phase 2] Implement PEER_SYNC message handling","description":"Handle agent_joined, agent_left, full_sync message types. Update FleetRegistry on sync. Trigger sync on new peer connection and periodically (60s).","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-005"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2"]}
+{"id":"fed-007","title":"[Phase 2] Implement name resolution","description":"Add resolve() method to FleetRegistry. Support patterns: Bob (local-first then fleet), Bob@server (explicit), *@server (broadcast to server). See FEDERATION_PROPOSAL.md Section 5.2.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-006"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2"]}
+{"id":"fed-008","title":"[Phase 2] CLI: agent-relay agents --fleet","description":"Add --fleet flag to agents command. Query FleetRegistry for all agents across servers. Display server, CLI, status, connected_at columns.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-005"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2","cli"]}
+{"id":"fed-009","title":"[Phase 3] Implement PEER_ROUTE handling","description":"Handle incoming PEER_ROUTE messages. Extract target agent, look up in local registry, deliver via existing tmux send-keys injection. Preserve automatic delivery magic.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-007"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
+{"id":"fed-010","title":"[Phase 3] Implement PEER_BROADCAST handling","description":"Handle incoming PEER_BROADCAST messages. Deliver to all local agents except excluded. Support scope (fleet vs server).","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-009"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
+{"id":"fed-011","title":"[Phase 3] Integrate FederatedRouter with existing Router","description":"Modify existing Router to check FleetRegistry for remote agents. Route to PeerManager for cross-server delivery. Keep local delivery path unchanged.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-010"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
+{"id":"fed-012","title":"[Phase 3] Implement loop prevention","description":"Add hops array to PEER_ROUTE messages. Check for loops before forwarding. Drop messages that have already visited this server.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
+{"id":"fed-013","title":"[Phase 4] Implement reconnection logic","description":"Add exponential backoff reconnection (1s→2s→4s→...→30s max). Unlimited retry attempts for configured peers. Resume session on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
+{"id":"fed-014","title":"[Phase 4] Implement message queue for disconnected peers","description":"Create src/federation/message-queue.ts. Bounded queue per peer (max 1000 messages). TTL expiration (1 hour). Flush on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","depends_on":["fed-013"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4","storage"]}
+{"id":"fed-015","title":"[Phase 4] Implement heartbeat (PING/PONG)","description":"Send PEER_PING every 30s. Expect PEER_PONG within 60s. Trigger reconnect on timeout. Track latency_ms in ServerRecord.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-013"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
+{"id":"fed-016","title":"[Phase 4] Implement graceful shutdown (PEER_BYE)","description":"Send PEER_BYE on daemon shutdown. Remove agents from fleet registry. Allow queued messages to be sent before disconnect.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-015"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
+{"id":"fed-017","title":"[Phase 4] CLI: agent-relay peer status","description":"Add peer subcommands: list, add, remove, status. Show connection state, latency, agent count, last seen.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-015"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4","cli"]}
+{"id":"fed-018","title":"[Phase 5] Add TLS support for peer connections","description":"Configure wss:// with cert/key/ca paths. Support mutual TLS (mTLS) optionally. Reject non-TLS in production mode.","status":"open","priority":1,"issue_type":"task","assignee":"Security","depends_on":["fed-016"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","security"]}
+{"id":"fed-019","title":"[Phase 5] Implement Ed25519 authentication","description":"Generate server keypair on first run. Challenge-response handshake. Sign all peer messages. Verify signatures on receive. See FEDERATION_PROPOSAL.md Section 8.1-8.3.","status":"open","priority":1,"issue_type":"task","assignee":"Security","depends_on":["fed-018"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","security"]}
+{"id":"fed-020","title":"[Phase 5] Add configuration file support","description":"Parse /etc/agent-relay/config.yaml or ~/.agent-relay/config.yaml. Support all federation settings from FEDERATION_PROPOSAL.md Section 12. Environment variable overrides.","status":"open","priority":1,"issue_type":"task","assignee":"CLI","depends_on":["fed-016"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","cli"]}
+{"id":"fed-021","title":"[Phase 5] Update dashboard for fleet view","description":"Add fleet view toggle to dashboard. Show agents from all servers with server badge. Show peer connection status. Real-time updates via existing WebSocket.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-008"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","dashboard"]}
+{"id":"fed-022","title":"[Phase 5] Implement delivery confirmation","description":"Add DELIVERY_CONFIRMED message type. Detect injection via capture-pane. Send confirmation back to sender. Optional sender notification. See FEDERATION_PROPOSAL.md Section 7.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5"]}
+{"id":"fed-023","title":"[Phase 5] Implement flow control","description":"Add credit-based flow control per peer. PEER_ACK to replenish credits. PEER_BUSY/PEER_READY for backpressure. Rate limiting. See FEDERATION_PROPOSAL.md Section 9.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-014"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5"]}
+{"id":"fed-024","title":"[Storage] Implement trajectory storage adapter","description":"Create TrajectoryStorageAdapter interface. File system backend (default). SQLite backend for local queries. See FEDERATION_PROPOSAL.md Section 17 and Trajectories Proposal.","status":"open","priority":2,"issue_type":"task","assignee":"Storage","created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","storage","trajectories"]}
+{"id":"fed-025","title":"[Storage] Implement central trajectory sync","description":"PostgreSQL backend for team sharing. Sync on task complete. Conflict resolution (server-wins default). See FEDERATION_PROPOSAL.md Section 17.3.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","depends_on":["fed-024"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","storage","trajectories"]}
+{"id":"fed-026","title":"[Transport] Implement NATS transport adapter","description":"Create NatsTransport implementing PeerTransport interface. JetStream for persistence. Subject-based routing. See FEDERATION_PROPOSAL.md Section 11.4.","status":"open","priority":3,"issue_type":"task","assignee":"Network","depends_on":["fed-023"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","nats","optional"]}
+{"id":"fed-027","title":"Write federation integration tests","description":"Docker Compose setup for multi-server testing. Test cross-server messaging, reconnection, broadcast. Chaos testing for network partitions.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","depends_on":["fed-022"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","testing"]}
+{"id":"fed-028","title":"Write federation documentation","description":"Update README with federation quick start. Document configuration options. Add architecture diagrams. Troubleshooting guide.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","depends_on":["fed-027"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","docs"]}

From 55ace49932182f215ce41192dcbe1c0e012b3b23 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 10:54:09 +0000
Subject: [PATCH 06/19] Update federation tasks with collaborators and fix
 dependencies

Changes:
- Add collaborators for cross-boundary tasks (8 tasks now dual-assigned)
- Fix fed-014 dependency (queue can start after fed-004, not fed-013)
- Add fed-026a for PeerTransport interface before NATS adapter
- Add Architect review to Security tasks (fed-019)
- Lower priority on fed-012 (loop prevention can merge with routing)

Dual assignments:
- fed-001: Architect + Network (protocol types)
- fed-011: Router + Network (federated router integration)
- fed-014: Storage + Network (message queue)
- fed-018: Security + Network (TLS)
- fed-019: Security + Architect (Ed25519 crypto review)
- fed-022: Router + Network (delivery confirmation)
- fed-023: Network + Storage (flow control)
- fed-026a: Architect + Network (transport interface)
- fed-027: Architect + Network + Router (integration tests)
---
 .beads/issues.jsonl | 25 +++++++++++++------------
 1 file changed, 13 insertions(+), 12 deletions(-)

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 50821c345..0c275d32c 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -49,8 +49,6 @@
 {"id":"agent-relay-sio","title":"Add graceful degradation when relay daemon is unavailable","description":"In wrapper/tmux-wrapper.ts:195-197, daemon connection failures are silently caught. Consider: (1) Periodic reconnection attempts, (2) Queueing messages for later delivery, (3) Visual indicator in terminal showing connection status.","status":"open","priority":2,"issue_type":"feature","created_at":"2025-12-20T00:17:32.600333+01:00","updated_at":"2025-12-20T00:17:32.600333+01:00"}
 {"id":"agent-relay-ucw","title":"Dashboard: multi-project navigation or dynamic port allocation","description":"When the dashboard is already running for one project, users should be able to either: (1) Navigate between different projects in a single dashboard view, OR (2) Start a new dashboard instance on an automatically allocated available port for a different project. Currently if a dashboard is running, starting another project conflicts.","status":"closed","priority":2,"issue_type":"feature","created_at":"2025-12-19T23:40:18.667766+01:00","updated_at":"2025-12-20T00:18:05.10495+01:00","closed_at":"2025-12-20T00:18:05.10495+01:00"}
 {"id":"agent-relay-v57","title":"No message expiration/cleanup in SQLite storage","description":"SQLite adapter has no TTL or cleanup mechanism for old messages. Over time, the database will grow unbounded. Add: (1) Configurable message retention period, (2) Automatic cleanup job, (3) Index on ts column is there but no cleanup uses it.","status":"open","priority":2,"issue_type":"task","created_at":"2025-12-20T00:18:01.86766+01:00","updated_at":"2025-12-20T00:18:01.86766+01:00"}
-{"id":"fed-001","title":"[Phase 1] Define peer protocol types","description":"Create src/protocol/peer-types.ts with PeerMessageType, PeerEnvelope, PeerHelloPayload, PeerWelcomePayload, PeerSyncPayload, PeerRoutePayload, PeerBroadcastPayload. See FEDERATION_PROPOSAL.md Section 4.","status":"open","priority":1,"issue_type":"task","assignee":"Architect","created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
-{"id":"fed-002","title":"[Phase 1] Implement PeerConnection class","description":"Create src/federation/peer-connection.ts. WebSocket connection to single peer with state machine (DISCONNECTED→CONNECTING→HANDSHAKING→ACTIVE→RECONNECTING). Handle HELLO/WELCOME handshake. See FEDERATION_PROPOSAL.md Section 4.6.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-001"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
 {"id":"fed-003","title":"[Phase 1] Implement PeerManager","description":"Create src/federation/peer-manager.ts. Manages multiple PeerConnection instances. Connect to configured peers on daemon start. Handle peer discovery from WELCOME messages.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-002"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
 {"id":"fed-004","title":"[Phase 1] Add peer WebSocket listener to daemon","description":"Extend daemon to listen for incoming peer connections on configurable port (default 8765). Accept connections, perform handshake, register with PeerManager.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-003"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
 {"id":"fed-005","title":"[Phase 2] Implement FleetRegistry","description":"Create src/federation/registry.ts with FleetRegistry class. Track agents across fleet with agentToServer mapping. Support fleet-wide unique name enforcement with quorum registration. See FEDERATION_PROPOSAL.md Section 5.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-004"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2"]}
@@ -59,21 +57,24 @@
 {"id":"fed-008","title":"[Phase 2] CLI: agent-relay agents --fleet","description":"Add --fleet flag to agents command. Query FleetRegistry for all agents across servers. Display server, CLI, status, connected_at columns.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-005"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-2","cli"]}
 {"id":"fed-009","title":"[Phase 3] Implement PEER_ROUTE handling","description":"Handle incoming PEER_ROUTE messages. Extract target agent, look up in local registry, deliver via existing tmux send-keys injection. Preserve automatic delivery magic.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-007"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
 {"id":"fed-010","title":"[Phase 3] Implement PEER_BROADCAST handling","description":"Handle incoming PEER_BROADCAST messages. Deliver to all local agents except excluded. Support scope (fleet vs server).","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-009"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
-{"id":"fed-011","title":"[Phase 3] Integrate FederatedRouter with existing Router","description":"Modify existing Router to check FleetRegistry for remote agents. Route to PeerManager for cross-server delivery. Keep local delivery path unchanged.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["fed-010"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
-{"id":"fed-012","title":"[Phase 3] Implement loop prevention","description":"Add hops array to PEER_ROUTE messages. Check for loops before forwarding. Drop messages that have already visited this server.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-3"]}
-{"id":"fed-013","title":"[Phase 4] Implement reconnection logic","description":"Add exponential backoff reconnection (1s→2s→4s→...→30s max). Unlimited retry attempts for configured peers. Resume session on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
-{"id":"fed-014","title":"[Phase 4] Implement message queue for disconnected peers","description":"Create src/federation/message-queue.ts. Bounded queue per peer (max 1000 messages). TTL expiration (1 hour). Flush on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","depends_on":["fed-013"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4","storage"]}
 {"id":"fed-015","title":"[Phase 4] Implement heartbeat (PING/PONG)","description":"Send PEER_PING every 30s. Expect PEER_PONG within 60s. Trigger reconnect on timeout. Track latency_ms in ServerRecord.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-013"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
 {"id":"fed-016","title":"[Phase 4] Implement graceful shutdown (PEER_BYE)","description":"Send PEER_BYE on daemon shutdown. Remove agents from fleet registry. Allow queued messages to be sent before disconnect.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-015"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
 {"id":"fed-017","title":"[Phase 4] CLI: agent-relay peer status","description":"Add peer subcommands: list, add, remove, status. Show connection state, latency, agent count, last seen.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-015"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4","cli"]}
-{"id":"fed-018","title":"[Phase 5] Add TLS support for peer connections","description":"Configure wss:// with cert/key/ca paths. Support mutual TLS (mTLS) optionally. Reject non-TLS in production mode.","status":"open","priority":1,"issue_type":"task","assignee":"Security","depends_on":["fed-016"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","security"]}
-{"id":"fed-019","title":"[Phase 5] Implement Ed25519 authentication","description":"Generate server keypair on first run. Challenge-response handshake. Sign all peer messages. Verify signatures on receive. See FEDERATION_PROPOSAL.md Section 8.1-8.3.","status":"open","priority":1,"issue_type":"task","assignee":"Security","depends_on":["fed-018"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","security"]}
 {"id":"fed-020","title":"[Phase 5] Add configuration file support","description":"Parse /etc/agent-relay/config.yaml or ~/.agent-relay/config.yaml. Support all federation settings from FEDERATION_PROPOSAL.md Section 12. Environment variable overrides.","status":"open","priority":1,"issue_type":"task","assignee":"CLI","depends_on":["fed-016"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","cli"]}
 {"id":"fed-021","title":"[Phase 5] Update dashboard for fleet view","description":"Add fleet view toggle to dashboard. Show agents from all servers with server badge. Show peer connection status. Real-time updates via existing WebSocket.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["fed-008"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5","dashboard"]}
-{"id":"fed-022","title":"[Phase 5] Implement delivery confirmation","description":"Add DELIVERY_CONFIRMED message type. Detect injection via capture-pane. Send confirmation back to sender. Optional sender notification. See FEDERATION_PROPOSAL.md Section 7.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5"]}
-{"id":"fed-023","title":"[Phase 5] Implement flow control","description":"Add credit-based flow control per peer. PEER_ACK to replenish credits. PEER_BUSY/PEER_READY for backpressure. Rate limiting. See FEDERATION_PROPOSAL.md Section 9.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-014"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-5"]}
 {"id":"fed-024","title":"[Storage] Implement trajectory storage adapter","description":"Create TrajectoryStorageAdapter interface. File system backend (default). SQLite backend for local queries. See FEDERATION_PROPOSAL.md Section 17 and Trajectories Proposal.","status":"open","priority":2,"issue_type":"task","assignee":"Storage","created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","storage","trajectories"]}
 {"id":"fed-025","title":"[Storage] Implement central trajectory sync","description":"PostgreSQL backend for team sharing. Sync on task complete. Conflict resolution (server-wins default). See FEDERATION_PROPOSAL.md Section 17.3.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","depends_on":["fed-024"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","storage","trajectories"]}
-{"id":"fed-026","title":"[Transport] Implement NATS transport adapter","description":"Create NatsTransport implementing PeerTransport interface. JetStream for persistence. Subject-based routing. See FEDERATION_PROPOSAL.md Section 11.4.","status":"open","priority":3,"issue_type":"task","assignee":"Network","depends_on":["fed-023"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","nats","optional"]}
-{"id":"fed-027","title":"Write federation integration tests","description":"Docker Compose setup for multi-server testing. Test cross-server messaging, reconnection, broadcast. Chaos testing for network partitions.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","depends_on":["fed-022"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","testing"]}
+{"id":"fed-001","title":"[Phase 1] Define peer protocol types","description":"Create src/protocol/peer-types.ts with PeerMessageType, PeerEnvelope, PeerHelloPayload, PeerWelcomePayload, PeerSyncPayload, PeerRoutePayload, PeerBroadcastPayload. See FEDERATION_PROPOSAL.md Section 4.","status":"open","priority":1,"issue_type":"task","assignee":"Architect","collaborators":["Network"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-011","title":"[Phase 3] Integrate FederatedRouter with existing Router","description":"Modify existing Router to check FleetRegistry for remote agents. Route to PeerManager for cross-server delivery. Keep local delivery path unchanged.","status":"open","priority":1,"issue_type":"task","assignee":"Router","collaborators":["Network"],"depends_on":["fed-010"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-3"]}
+{"id":"fed-012","title":"[Phase 3] Implement loop prevention","description":"Add hops array to PEER_ROUTE messages. Check for loops before forwarding. Drop messages that have already visited this server. Note: Can be implemented as part of fed-009/010.","status":"open","priority":3,"issue_type":"task","assignee":"Router","depends_on":["fed-009"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-3","can-merge"]}
+{"id":"fed-014","title":"[Phase 4] Implement message queue for disconnected peers","description":"Create src/federation/message-queue.ts. Bounded queue per peer (max 1000 messages). TTL expiration (1 hour). Flush on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","collaborators":["Network"],"depends_on":["fed-004"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-4","storage"]}
+{"id":"fed-018","title":"[Phase 5] Add TLS support for peer connections","description":"Configure wss:// with cert/key/ca paths. Support mutual TLS (mTLS) optionally. Reject non-TLS in production mode.","status":"open","priority":1,"issue_type":"task","assignee":"Security","collaborators":["Network"],"depends_on":["fed-016"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5","security"]}
+{"id":"fed-022","title":"[Phase 5] Implement delivery confirmation","description":"Add DELIVERY_CONFIRMED message type. Detect injection via capture-pane. Send confirmation back to sender. Optional sender notification. See FEDERATION_PROPOSAL.md Section 7.","status":"open","priority":2,"issue_type":"task","assignee":"Router","collaborators":["Network"],"depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5"]}
+{"id":"fed-026a","title":"[Transport] Define PeerTransport interface","description":"Create src/federation/transport.ts with PeerTransport interface: connect(), disconnect(), send(), broadcast(), subscribe(), getConnectedPeers(). WebSocketTransport as default implementation. See FEDERATION_PROPOSAL.md Section 11.2.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","collaborators":["Network"],"depends_on":["fed-023"],"created_at":"2025-12-21T13:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","transport"]}
+{"id":"fed-026","title":"[Transport] Implement NATS transport adapter","description":"Create NatsTransport implementing PeerTransport interface. JetStream for persistence. Subject-based routing. See FEDERATION_PROPOSAL.md Section 11.4.","status":"open","priority":3,"issue_type":"task","assignee":"Network","depends_on":["fed-026a"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","nats","optional"]}
+{"id":"fed-027","title":"Write federation integration tests","description":"Docker Compose setup for multi-server testing. Test cross-server messaging, reconnection, broadcast. Chaos testing for network partitions.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","collaborators":["Network","Router"],"depends_on":["fed-022"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","testing"]}
+{"id":"fed-002","title":"[Phase 1] Implement PeerConnection class","description":"Create src/federation/peer-connection.ts. WebSocket connection to single peer with state machine (DISCONNECTED→CONNECTING→HANDSHAKING→ACTIVE→RECONNECTING). Handle HELLO/WELCOME handshake. See FEDERATION_PROPOSAL.md Section 4.6.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-001"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-1"]}
+{"id":"fed-013","title":"[Phase 4] Implement reconnection logic","description":"Add exponential backoff reconnection (1s→2s→4s→...→30s max). Unlimited retry attempts for configured peers. Resume session on reconnect.","status":"open","priority":1,"issue_type":"task","assignee":"Network","depends_on":["fed-011"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","phase-4"]}
+{"id":"fed-019","title":"[Phase 5] Implement Ed25519 authentication","description":"Generate server keypair on first run. Challenge-response handshake. Sign all peer messages. Verify signatures on receive. See FEDERATION_PROPOSAL.md Section 8.1-8.3.","status":"open","priority":1,"issue_type":"task","assignee":"Security","collaborators":["Architect"],"depends_on":["fed-018"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5","security"]}
+{"id":"fed-023","title":"[Phase 5] Implement flow control","description":"Add credit-based flow control per peer. PEER_ACK to replenish credits. PEER_BUSY/PEER_READY for backpressure. Rate limiting. See FEDERATION_PROPOSAL.md Section 9.","status":"open","priority":2,"issue_type":"task","assignee":"Network","collaborators":["Storage"],"depends_on":["fed-014"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5"]}
 {"id":"fed-028","title":"Write federation documentation","description":"Update README with federation quick start. Document configuration options. Add architecture diagrams. Troubleshooting guide.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","depends_on":["fed-027"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","docs"]}

From 7972f06cbb6babe8e752f685891b9a597156f8e9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 12:26:03 +0000
Subject: [PATCH 07/19] Add control plane roadmap and ai-maestro competitive
 analysis

Control Plane Tasks (12):
- ctrl-001: Design Control API (REST + WebSocket)
- ctrl-002: Lead Agent orchestration
- ctrl-003: Web dashboard v2 (fleet control)
- ctrl-004: Human authentication (OAuth/magic link)
- ctrl-005: Push notification service (APNs/FCM)
- ctrl-006: iPhone app MVP
- ctrl-007: Slack/Discord bot integration
- ctrl-008: Human escalation queue
- ctrl-009: Agent skills registry
- ctrl-010: Code Graph integration (from ai-maestro)
- ctrl-011: Agent health monitoring (from ai-maestro)
- ctrl-012: Agent portability export/import (from ai-maestro)

Competitive Analysis:
- ai-maestro uses file-based messaging (human relay required)
- agent-relay uses auto-injection (truly autonomous)
- Learn from ai-maestro: Code Graphs, health monitoring, portability
- Our advantage: real-time messaging backbone
---
 .beads/issues.jsonl                     |  12 ++
 docs/competitive-analysis-ai-maestro.md | 249 ++++++++++++++++++++++++
 2 files changed, 261 insertions(+)
 create mode 100644 docs/competitive-analysis-ai-maestro.md

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 0c275d32c..65cccb22c 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -78,3 +78,15 @@
 {"id":"fed-019","title":"[Phase 5] Implement Ed25519 authentication","description":"Generate server keypair on first run. Challenge-response handshake. Sign all peer messages. Verify signatures on receive. See FEDERATION_PROPOSAL.md Section 8.1-8.3.","status":"open","priority":1,"issue_type":"task","assignee":"Security","collaborators":["Architect"],"depends_on":["fed-018"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5","security"]}
 {"id":"fed-023","title":"[Phase 5] Implement flow control","description":"Add credit-based flow control per peer. PEER_ACK to replenish credits. PEER_BUSY/PEER_READY for backpressure. Rate limiting. See FEDERATION_PROPOSAL.md Section 9.","status":"open","priority":2,"issue_type":"task","assignee":"Network","collaborators":["Storage"],"depends_on":["fed-014"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T13:00:00Z","labels":["federation","phase-5"]}
 {"id":"fed-028","title":"Write federation documentation","description":"Update README with federation quick start. Document configuration options. Add architecture diagrams. Troubleshooting guide.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","depends_on":["fed-027"],"created_at":"2025-12-21T12:00:00Z","updated_at":"2025-12-21T12:00:00Z","labels":["federation","docs"]}
+{"id":"ctrl-001","title":"[Control Plane] Design Control API (REST + WebSocket)","description":"Design API for human control plane: POST /tasks, GET /agents, POST /agents/:id/msg, WS /stream. OpenAPI spec. Authentication endpoints. See ai-maestro's manager/worker pattern for reference.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","collaborators":["CLI"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","api"]}
+{"id":"ctrl-002","title":"[Control Plane] Implement Lead Agent orchestration","description":"Lead Agent receives human intent, breaks into tasks, assigns to specialized agents based on skills, monitors progress, escalates decisions. Can be AI-powered or rule-based initially.","status":"open","priority":2,"issue_type":"task","assignee":"Router","collaborators":["Architect"],"depends_on":["ctrl-001","fed-011"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","orchestration"]}
+{"id":"ctrl-003","title":"[Control Plane] Build web dashboard v2 (fleet control)","description":"Next.js dashboard with: fleet overview, task assignment UI, agent status cards, trajectory viewer, decision queue. Learn from ai-maestro's hierarchical naming and color coding.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["ctrl-001","fed-021"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","dashboard"]}
+{"id":"ctrl-004","title":"[Control Plane] Human authentication (OAuth/magic link)","description":"Secure human access to control plane. OAuth2 (GitHub/Google) or magic link email auth. JWT tokens for API access. Role-based permissions (admin/operator/viewer).","status":"open","priority":2,"issue_type":"task","assignee":"Security","depends_on":["ctrl-001"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","security"]}
+{"id":"ctrl-005","title":"[Control Plane] Push notification service","description":"Real-time notifications to mobile/web: APNs for iOS, FCM for Android/web. Notify on: task completion, agent questions, escalations, errors.","status":"open","priority":3,"issue_type":"task","assignee":"Network","depends_on":["ctrl-001"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","notifications"]}
+{"id":"ctrl-006","title":"[Control Plane] iPhone app MVP","description":"React Native or Swift app: fleet status view, task list, agent messaging, push notifications, decision queue with quick actions. Minimal viable version.","status":"open","priority":3,"issue_type":"task","assignee":"CLI","collaborators":["Architect"],"depends_on":["ctrl-003","ctrl-005"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","mobile"]}
+{"id":"ctrl-007","title":"[Control Plane] Slack/Discord bot integration","description":"Bot for team chat: /fleet status, /assign task agent, @mentions for escalations, threaded replies for agent questions. Webhook-based notifications.","status":"open","priority":3,"issue_type":"task","assignee":"CLI","depends_on":["ctrl-001"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","integrations"]}
+{"id":"ctrl-008","title":"[Control Plane] Human escalation queue","description":"Queue for decisions needing human input. Agents can escalate with context. Humans respond via any interface (web/mobile/Slack). Response routed back to agent.","status":"open","priority":2,"issue_type":"task","assignee":"Router","collaborators":["CLI"],"depends_on":["ctrl-002"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","escalation"]}
+{"id":"ctrl-009","title":"[Control Plane] Agent skills registry","description":"Track agent capabilities: skills map (Network→websocket,tcp,tls), availability, current workload, performance history. Used by Lead Agent for smart assignment.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["fed-005"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","registry"]}
+{"id":"ctrl-010","title":"[Control Plane] Code Graph integration","description":"Learn from ai-maestro: visualize codebase as graph (classes, functions, relationships). Delta indexing for performance. Shared context for all agents. Multi-language support.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","collaborators":["Architect"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","intelligence"]}
+{"id":"ctrl-011","title":"[Control Plane] Agent health monitoring","description":"Like ai-maestro's green/red/yellow status: heartbeat checks, resource usage, error rates, stuck detection. Alert on degradation. Auto-recovery actions.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-015"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","monitoring"]}
+{"id":"ctrl-012","title":"[Control Plane] Agent portability (export/import)","description":"Learn from ai-maestro: export agent as .zip with config, message history, git associations, skills. Import with conflict detection. Enable agent migration between fleets.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","depends_on":["fed-024"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","portability"]}
diff --git a/docs/competitive-analysis-ai-maestro.md b/docs/competitive-analysis-ai-maestro.md
new file mode 100644
index 000000000..a12404cb1
--- /dev/null
+++ b/docs/competitive-analysis-ai-maestro.md
@@ -0,0 +1,249 @@
+# Competitive Analysis: ai-maestro vs agent-relay
+
+**Date:** 2025-12-21
+**Source:** https://github.com/23blocks-OS/ai-maestro (v0.17.7)
+
+## Executive Summary
+
+ai-maestro is a mature orchestration dashboard for multi-agent coordination. It takes a **human-as-orchestrator** approach with rich visualization, while agent-relay focuses on **automatic agent-to-agent communication**. Both use tmux for agent sessions, but differ fundamentally in message delivery and autonomy.
+
+---
+
+## Architecture Comparison
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           AI-MAESTRO                                         │
+│                                                                              │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                    Web Dashboard (localhost:23000)                   │   │
+│  │                    Next.js + TypeScript + Tailwind                   │   │
+│  └────────────────────────────────┬────────────────────────────────────┘   │
+│                                   │ WebSocket                               │
+│                                   ▼                                         │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                         Manager Node                                 │   │
+│  │              (Coordinates workers, proxies connections)              │   │
+│  └────────────────────────────────┬────────────────────────────────────┘   │
+│                                   │ Tailscale VPN / Local                   │
+│          ┌────────────────────────┼────────────────────────┐               │
+│          ▼                        ▼                        ▼               │
+│  ┌──────────────┐        ┌──────────────┐        ┌──────────────┐         │
+│  │ Worker Node  │        │ Worker Node  │        │ Worker Node  │         │
+│  │ (tmux agents)│        │ (tmux agents)│        │ (tmux agents)│         │
+│  └──────────────┘        └──────────────┘        └──────────────┘         │
+│                                                                              │
+│  Communication: FILE-BASED (read from disk, human relays)                   │
+│  Human Role: ORCHESTRATOR (assigns tasks, relays messages)                  │
+└─────────────────────────────────────────────────────────────────────────────┘
+
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           AGENT-RELAY                                        │
+│                                                                              │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                     Dashboard (localhost:3888)                       │   │
+│  │                         + CLI Interface                              │   │
+│  └────────────────────────────────┬────────────────────────────────────┘   │
+│                                   │                                         │
+│                                   ▼                                         │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                      Daemon (per server)                             │   │
+│  │        Unix socket (local) + WebSocket (federation - planned)        │   │
+│  └────────────────────────────────┬────────────────────────────────────┘   │
+│                                   │ tmux send-keys                          │
+│          ┌────────────────────────┼────────────────────────┐               │
+│          ▼                        ▼                        ▼               │
+│  ┌──────────────┐        ┌──────────────┐        ┌──────────────┐         │
+│  │ tmux: Alice  │◄──────►│ tmux: Bob    │◄──────►│ tmux: Carol  │         │
+│  │ (auto-inject)│        │ (auto-inject)│        │ (auto-inject)│         │
+│  └──────────────┘        └──────────────┘        └──────────────┘         │
+│                                                                              │
+│  Communication: REAL-TIME (daemon routes, auto-injects via tmux)            │
+│  Human Role: OBSERVER (agents communicate autonomously)                     │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+---
+
+## Feature Comparison
+
+| Feature | ai-maestro | agent-relay | Notes |
+|---------|------------|-------------|-------|
+| **Message Delivery** | File-based, human relays | Auto-inject via tmux | agent-relay is truly autonomous |
+| **Web Dashboard** | ✅ Rich (Next.js) | ⚠️ Basic (Ink) | ai-maestro more mature |
+| **Mobile App** | ❌ None | 📋 Planned | Opportunity for agent-relay |
+| **Distributed** | ✅ Tailscale VPN | 📋 Federation planned | Both support multi-server |
+| **Agent Sessions** | tmux (read-only) | tmux (read + inject) | agent-relay has write access |
+| **Code Intelligence** | ✅ Code Graphs | 📋 Planned | Learn from ai-maestro |
+| **Conversation Memory** | ✅ CozoDB + search | 📋 Trajectories planned | Similar goals |
+| **Agent Portability** | ✅ .zip export/import | 📋 Planned | Useful feature to adopt |
+| **Health Monitoring** | ✅ Green/red/yellow | ⚠️ Basic | Adopt their status model |
+| **Task Distribution** | Manual (human assigns) | Manual → Lead Agent | agent-relay planning automation |
+| **Hierarchical Naming** | ✅ project-backend-api | ❌ Flat names | Consider adopting |
+
+---
+
+## Key Differentiators
+
+### agent-relay's Advantages
+
+1. **Automatic Message Injection**
+   ```
+   ai-maestro:  Agent writes to file → Human reads → Human types to other agent
+   agent-relay: Agent outputs @relay:Bob → Daemon injects → Bob sees immediately
+   ```
+   This is the core differentiator. Agents can truly collaborate without human relay.
+
+2. **Real-Time Communication**
+   - ai-maestro: Polling-based file reads
+   - agent-relay: Push-based via daemon + WebSocket
+
+3. **Simpler Local Setup**
+   - ai-maestro: Manager + workers + VPN for distributed
+   - agent-relay: Single daemon, federation opt-in
+
+### ai-maestro's Advantages
+
+1. **Mature Web Dashboard**
+   - Rich visualizations
+   - Hierarchical agent tree with color coding
+   - Code graph browser
+   - Conversation search
+
+2. **Code Intelligence**
+   - Multi-language code graphs (Ruby, TypeScript, Python)
+   - Delta indexing (~100ms vs 1000ms+ full re-index)
+   - Relationship visualization
+
+3. **Agent Portability**
+   - Export agent as .zip
+   - Full config + message history + git associations
+   - Import with conflict detection
+
+4. **Health Monitoring**
+   - Visual status indicators (green/red/yellow)
+   - Per-agent resource tracking
+   - Stuck detection
+
+---
+
+## What We Should Learn from ai-maestro
+
+### 1. Hierarchical Agent Naming
+```
+Current:    Alice, Bob, Carol (flat)
+ai-maestro: project-backend-api, project-frontend-ui (hierarchical)
+Proposed:   federation/network/peer-connection (namespace/role/task)
+```
+
+### 2. Code Graph Visualization
+- Shared codebase understanding across agents
+- Delta indexing for performance
+- Multi-language support
+- Add as `ctrl-010` task
+
+### 3. Agent Health Status
+```typescript
+type AgentStatus = 'healthy' | 'degraded' | 'offline';
+
+interface AgentHealth {
+  status: AgentStatus;
+  lastHeartbeat: number;
+  cpuUsage?: number;
+  memoryUsage?: number;
+  errorRate?: number;
+  isStuck?: boolean;  // No output for N minutes
+}
+```
+
+### 4. Agent Portability
+```
+agent-export.zip
+├── config.json        # Agent configuration
+├── messages.jsonl     # Message history
+├── trajectory.json    # Work history (ours)
+├── git-associations/  # Linked repos
+└── skills.json        # Capability registry
+```
+
+### 5. Conversation Memory
+- ai-maestro uses CozoDB for semantic search
+- We can leverage trajectories + embeddings
+- Cross-agent knowledge sharing
+
+---
+
+## What ai-maestro Could Learn from Us
+
+### 1. Automatic Message Injection
+Their file-based messaging requires human relay. Our tmux send-keys approach enables true agent autonomy.
+
+### 2. End-to-End Delivery Confirmation
+We're planning delivery confirmation via capture-pane. They have no delivery guarantees.
+
+### 3. Peer-to-Peer Federation
+Our federation proposal is peer-to-peer mesh. They require central manager + VPN.
+
+### 4. Trajectories
+Our trajectory format captures complete work history with reasoning, decisions, and retrospectives—richer than their conversation memory.
+
+---
+
+## Integration Opportunities
+
+### Could We Integrate?
+
+1. **Use ai-maestro's Dashboard**
+   - Their web UI is more mature
+   - We provide the messaging backbone
+   - They display, we deliver
+
+2. **Share Code Intelligence**
+   - Their Code Graphs + our Trajectories
+   - Unified knowledge layer
+
+3. **Complementary Roles**
+   ```
+   ai-maestro: Human observation & control (read-mostly)
+   agent-relay: Agent-to-agent messaging (write-inject)
+   ```
+
+### Technical Compatibility
+
+- Both use tmux for agent sessions ✅
+- Both use WebSocket for real-time ✅
+- Different message formats (file vs protocol) ⚠️
+- Different authentication models ⚠️
+
+---
+
+## Roadmap Impact
+
+Based on this analysis, prioritize these additions:
+
+| Priority | Task | Source |
+|----------|------|--------|
+| High | Agent health monitoring | ai-maestro |
+| High | Code Graph integration | ai-maestro |
+| Medium | Hierarchical naming | ai-maestro |
+| Medium | Agent portability | ai-maestro |
+| Medium | Delta indexing | ai-maestro |
+| Low | CozoDB integration | ai-maestro |
+
+These are captured in `ctrl-010`, `ctrl-011`, `ctrl-012` tasks.
+
+---
+
+## Conclusion
+
+**ai-maestro** excels at human observation and visualization but lacks autonomous agent communication.
+
+**agent-relay** excels at automatic agent-to-agent messaging but needs richer dashboards and intelligence features.
+
+The ideal system combines:
+- agent-relay's automatic message injection
+- ai-maestro's visualization and code intelligence
+- Our proposed federation for distributed deployment
+- Our proposed control plane for human oversight
+
+We're building the messaging backbone; ai-maestro shows what the UI layer should look like.

From 7666f782bc3d21cab694210a9039e895cd835496 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 13:58:32 +0000
Subject: [PATCH 08/19] Refactor storage architecture to reference
 agent-trajectories
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- agent-relay owns ephemeral storage (routing queues, ACKs, flow control)
- agent-trajectories owns durable storage (trajectories, knowledge workspace)
- Add event emission interface for agent-relay → agent-trajectories integration
- Remove duplicate trajectory storage details (now in agent-trajectories repo)
- Update summary to reflect separation of concerns
---
 docs/FEDERATION_PROPOSAL.md | 280 ++++++------------------------------
 1 file changed, 46 insertions(+), 234 deletions(-)

diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md
index b4bc548e1..8ad9ef887 100644
--- a/docs/FEDERATION_PROPOSAL.md
+++ b/docs/FEDERATION_PROPOSAL.md
@@ -1789,38 +1789,37 @@ These questions remain unresolved and need input before/during implementation:
 
 ## 17. Storage Architecture *(NEW)*
 
-Federation introduces distinct storage requirements: **ephemeral storage** for message routing and **durable storage** for trajectories and work history. These have fundamentally different characteristics.
+Federation introduces storage requirements for **ephemeral message routing**. Durable storage for trajectories and work history is handled by the separate [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) project.
 
-### 17.1 Two Storage Domains
+### 17.1 Separation of Concerns
 
 ```
 ┌─────────────────────────────────────────────────────────────────────────────┐
-│                         STORAGE ARCHITECTURE                                 │
+│                         STORAGE RESPONSIBILITY                               │
 ├─────────────────────────────────────────────────────────────────────────────┤
 │                                                                              │
 │  ┌─────────────────────────┐      ┌─────────────────────────────────────┐  │
-│  │   EPHEMERAL STORAGE     │      │       DURABLE STORAGE                │  │
-│  │   (Message Routing)     │      │       (Trajectories)                 │  │
+│  │   agent-relay           │      │   agent-trajectories                 │  │
+│  │   (This Project)        │      │   (Separate Project)                 │  │
 │  │                         │      │                                      │  │
+│  │  EPHEMERAL STORAGE      │      │  DURABLE STORAGE                    │  │
 │  │  • Peer message queues  │      │  • Agent work history               │  │
 │  │  • Pending ACKs         │      │  • Decisions & retrospectives       │  │
-│  │  • Flow control credits │      │  • Inter-agent conversations        │  │
+│  │  • Flow control credits │      │  • Knowledge workspace              │  │
 │  │  • Connection state     │      │  • Exported artifacts               │  │
+│  │  • Registry cache       │      │  • Semantic search / RAG            │  │
 │  │                         │      │                                      │  │
 │  │  Lifetime: minutes/hours│      │  Lifetime: months/years             │  │
-│  │  Size: KB-MB per peer   │      │  Size: MB-GB per project            │  │
-│  │  Loss impact: retry     │      │  Loss impact: permanent             │  │
-│  │                         │      │                                      │  │
-│  │  Backend:               │      │  Backend:                           │  │
-│  │  • Memory (default)     │      │  • File system (default)            │  │
-│  │  • NATS JetStream       │      │  • SQLite (local queries)           │  │
-│  │                         │      │  • PostgreSQL (team sharing)        │  │
-│  │                         │      │  • S3/GCS (archive)                 │  │
+│  │  Backend: Memory, NATS  │      │  Backend: File, SQLite, PG, S3      │  │
 │  └─────────────────────────┘      └─────────────────────────────────────┘  │
 │                                                                              │
+│  Integration: agent-relay emits events → agent-trajectories captures them   │
+│                                                                              │
 └─────────────────────────────────────────────────────────────────────────────┘
 ```
 
+> **See:** [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) for trajectory format, storage backends, and knowledge workspace.
+
 ### 17.2 Ephemeral Storage (Message Routing)
 
 For federation's real-time message routing, **memory is the default**. Messages are transient—they matter for delivery, not history.
@@ -1886,171 +1885,41 @@ const routingStream = {
 - Long disconnection tolerance needed
 - Already have NATS infrastructure
 
-### 17.3 Durable Storage (Trajectories)
-
-For long-term work history, **durable storage is essential**. This stores agent trajectories—the complete record of task work including prompts, reasoning, decisions, and retrospectives.
+### 17.3 Integration with agent-trajectories
 
-> **See also:** [Trajectories Proposal](https://github.com/khaliqgant/agent-relay/pull/3) for detailed format specification.
-
-#### Storage Tiers
-
-```
-┌─────────────────────────────────────────────────────────────────────────────┐
-│                    TRAJECTORY STORAGE TIERS                                  │
-│                                                                              │
-│  ┌─────────────┐     ┌─────────────┐     ┌─────────────┐     ┌────────────┐│
-│  │  Active     │     │   Local     │     │   Central   │     │  Archive   ││
-│  │  (File)     │────►│  (SQLite)   │────►│  (Postgres) │────►│  (S3)      ││
-│  │             │     │             │     │             │     │            ││
-│  │ In-progress │     │ Completed   │     │ Team-shared │     │ Cold       ││
-│  │ trajectories│     │ trajectories│     │ trajectories│     │ storage    ││
-│  │             │     │ (indexed)   │     │             │     │            ││
-│  │ .trajectories/    │ trajectories.db   │ Central DB  │     │ S3 bucket  ││
-│  │  active/    │     │             │     │             │     │            ││
-│  └─────────────┘     └─────────────┘     └─────────────┘     └────────────┘│
-│                                                                              │
-│  Speed: ◄────────────────────────────────────────────────────────────► Cost │
-│         Fastest                                                    Cheapest  │
-└─────────────────────────────────────────────────────────────────────────────┘
-```
-
-#### File System (Default)
-
-```
-.trajectories/
-├── index.json                    # Quick lookup index
-├── active/                       # In-progress trajectories
-│   └── traj_abc123.json
-├── completed/                    # Finished trajectories
-│   ├── 2024-01/
-│   │   ├── traj_def456.json     # Full trajectory data
-│   │   └── traj_def456.md       # Human-readable export
-│   └── 2024-02/
-└── archive/                      # Compressed old trajectories
-```
-
-**Properties:**
-- ✅ Git-friendly (can commit trajectories with code)
-- ✅ No external deps
-- ✅ Portable (copy directory to share)
-- ❌ No cross-server queries
-- ❌ No team sharing without file sync
-
-#### SQLite (Local)
-
-For indexing and querying completed trajectories:
-
-```sql
--- Same DB can hold both routing state and trajectories
--- /tmp/agent-relay/state.sqlite
-
-CREATE TABLE trajectories (
-  id TEXT PRIMARY KEY,
-  task_id TEXT,
-  project_id TEXT NOT NULL,
-  started_at INTEGER NOT NULL,
-  completed_at INTEGER,
-  agent_names TEXT,                -- JSON array
-  chapters TEXT NOT NULL,          -- JSON
-  retrospective TEXT,              -- JSON
-  created_at INTEGER NOT NULL
-);
-
-CREATE INDEX idx_traj_task ON trajectories(task_id);
-CREATE INDEX idx_traj_project ON trajectories(project_id);
-```
-
-**Properties:**
-- ✅ Fast local queries
-- ✅ Single-file, easy backup
-- ✅ No external deps
-- ❌ Single-server scope
-
-#### PostgreSQL (Central)
-
-For team-wide trajectory sharing:
+agent-relay emits events that [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) can capture:
 
 ```typescript
-// Central trajectory store configuration
-interface CentralStorageConfig {
-  type: 'postgresql';
-  connectionString: string;
-  schema: 'agent_relay';
-
-  // Sync behavior
-  syncOnComplete: boolean;        // Push trajectory when task completes
-  syncOnDemand: boolean;          // Pull trajectories from central
-  conflictResolution: 'server-wins' | 'local-wins' | 'merge';
+// agent-relay daemon emits events
+interface RelayEvent {
+  type: 'message_sent' | 'message_received' | 'broadcast';
+  from: string;
+  to: string;
+  content: string;
+  ts: number;
+  messageId: string;
 }
-```
 
-**Properties:**
-- ✅ Team-wide visibility
-- ✅ Rich querying (across all servers)
-- ✅ Central backup
-- ❌ Requires PostgreSQL infrastructure
-- ❌ Network dependency for writes
-
-#### S3/GCS (Archive)
-
-For long-term cold storage:
-
-```typescript
-interface ArchiveConfig {
-  type: 's3' | 'gcs';
-  bucket: string;
-  prefix: 'trajectories/';
-
-  // Lifecycle
-  archiveAfterDays: 90;           // Move to archive after 90 days
-  format: 'json' | 'json.gz';     // Compress for storage
-}
-```
-
-### 17.4 Export Format
-
-Trajectories export to a portable `.trajectory` format:
-
-```
-task-bd-123.trajectory/
-├── manifest.json                 # Metadata and table of contents
-├── trajectory.json               # Machine-readable full data
-├── trajectory.md                 # Human-readable narrative
-└── assets/                       # Attachments (screenshots, files)
-    ├── screenshot-001.png
-    └── diff-summary.patch
+// Event emitter in daemon
+this.events.emit('relay:message', {
+  type: 'message_sent',
+  from: 'Alice',
+  to: 'Bob',
+  content: 'Can you review auth.ts?',
+  ts: Date.now(),
+  messageId: 'msg-abc123'
+});
 ```
 
-**Manifest structure:**
+agent-trajectories subscribes to these events and incorporates them into task trajectories as inter-agent communication records.
 
-```json
-{
-  "version": 1,
-  "trajectory_id": "traj_abc123",
-  "task": {
-    "source": "beads",
-    "id": "bd-123",
-    "title": "Implement rate limiting"
-  },
-  "created_at": "2025-01-15T10:00:00Z",
-  "completed_at": "2025-01-15T14:30:00Z",
-  "agents": ["Alice", "Bob"],
-  "summary": {
-    "chapters": 5,
-    "decisions": 3,
-    "files_changed": 12,
-    "total_events": 156
-  }
-}
-```
-
-### 17.5 Storage Configuration
+### 17.4 Storage Configuration
 
 ```yaml
 # /etc/agent-relay/config.yaml
 
 storage:
-  # Ephemeral (routing)
+  # Ephemeral (routing) - managed by agent-relay
   ephemeral:
     type: memory                   # memory | nats
     max_queue_per_peer: 1000
@@ -2062,84 +1931,27 @@ storage:
       max_age_seconds: 3600
       max_bytes: 104857600         # 100 MB
 
-  # Durable (trajectories)
-  trajectories:
-    # Local storage
-    local:
-      type: file                   # file | sqlite
-      path: .trajectories/
-      index_db: .trajectories/index.sqlite
-
-    # Optional central storage
-    central:
-      enabled: false
-      type: postgresql
-      connection_string: "${TRAJECTORY_DB_URL}"
-      sync_on_complete: true
-
-    # Optional archive
-    archive:
-      enabled: false
-      type: s3
-      bucket: company-trajectories
-      region: us-east-1
-      archive_after_days: 90
+  # Event emission for agent-trajectories integration
+  events:
+    enabled: true
+    emit_to: 'unix:///tmp/agent-trajectories/events.sock'
 ```
 
-### 17.6 Federation Impact on Storage
+### 17.5 Federation Impact on Storage
 
-When federation is enabled, storage considerations change:
+When federation is enabled:
 
 | Concern | Single Server | Federated Fleet |
 |---------|---------------|-----------------|
 | **Routing queues** | Per-agent | Per-peer + per-agent |
 | **Registry** | Local only | Fleet-wide sync |
-| **Trajectories** | Local files | Central DB recommended |
-| **Message history** | Optional | Recommended for debugging |
+| **Event emission** | Optional | Recommended (for trajectories) |
 
-**Recommendations for federated deployments:**
+**Recommendations:**
 
 1. **Routing:** Use NATS if available, otherwise memory with bounded queues
 2. **Registry:** Memory + periodic persistence (survive restarts)
-3. **Trajectories:** SQLite local + PostgreSQL central for team visibility
-4. **Archive:** S3 for cost-effective long-term storage
-
-### 17.7 Data Flow
-
-```
-┌─────────────────────────────────────────────────────────────────────────────┐
-│                    DATA FLOW: ROUTING vs TRAJECTORIES                        │
-│                                                                              │
-│  Message Send                              Task Work                         │
-│       │                                         │                            │
-│       ▼                                         ▼                            │
-│  ┌─────────────┐                         ┌─────────────┐                    │
-│  │ Route via   │                         │ Capture     │                    │
-│  │ ephemeral   │                         │ events      │                    │
-│  │ queues      │                         │ (trajectory)│                    │
-│  └──────┬──────┘                         └──────┬──────┘                    │
-│         │                                       │                            │
-│         ▼                                       ▼                            │
-│  ┌─────────────┐                         ┌─────────────┐                    │
-│  │ Deliver     │                         │ Write to    │                    │
-│  │ & discard   │◄── separate ───────────►│ durable     │                    │
-│  │             │    concerns             │ storage     │                    │
-│  └─────────────┘                         └──────┬──────┘                    │
-│                                                 │                            │
-│                                                 ▼                            │
-│                                          ┌─────────────┐                    │
-│                                          │ Sync to     │                    │
-│                                          │ central     │                    │
-│                                          │ (optional)  │                    │
-│                                          └──────┬──────┘                    │
-│                                                 │                            │
-│                                                 ▼                            │
-│                                          ┌─────────────┐                    │
-│                                          │ Archive to  │                    │
-│                                          │ S3 (cold)   │                    │
-│                                          └─────────────┘                    │
-└─────────────────────────────────────────────────────────────────────────────┘
-```
+3. **Events:** Enable emission to agent-trajectories for cross-agent history
 
 ---
 
@@ -2155,7 +1967,7 @@ This revised proposal addresses the critical issues identified in review:
 | No backpressure | Credit-based flow control (Section 9) |
 | Timeline unrealistic | Revised to 8-10 weeks (Section 14.3) |
 | NATS consideration | Pluggable transport layer (Section 11) |
-| Storage for trajectories | Two-tier storage architecture (Section 17) |
+| Trajectory storage | Delegated to [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) (Section 17) |
 
 **Key additions in v2:**
 - End-to-end delivery confirmation
@@ -2163,7 +1975,7 @@ This revised proposal addresses the critical issues identified in review:
 - Ed25519 authentication (scales better)
 - Credit-based flow control + rate limiting
 - Transport abstraction for NATS option
-- Storage architecture (ephemeral routing + durable trajectories)
+- Ephemeral storage for routing, events emitted to agent-trajectories
 - Realistic timeline with MVP option
 - Open questions for discussion
 

From 334fd73e7553a7c775ba9d80cdd0dd11218d1214 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 14:45:33 +0000
Subject: [PATCH 09/19] Add Mem0 as memory substrate decision

Decision: Use Mem0 (github.com/mem0ai/mem0) as memory layer for
agent-trajectories rather than building from scratch.

Why Mem0:
- 25k+ stars, YC-backed, active development
- Multi-LLM support (not just OpenAI)
- MCP integration exists for Claude Code
- Self-hosted option (Apache 2.0)
- +26% accuracy vs OpenAI Memory benchmarks

What we build on top:
- Task-based trajectory grouping
- Inter-agent event capture
- Fleet-wide knowledge workspace
- .trajectory export format

New tasks (mem-001 through mem-005):
- Integrate Mem0 SDK
- Configure MCP for Claude Code
- Build trajectory layer
- Implement knowledge workspace
- Abstract MemoryBackend interface

See docs/MEMORY_STACK_DECISION.md for full rationale.
---
 .beads/issues.jsonl           |   5 +
 docs/FEDERATION_PROPOSAL.md   |   2 +
 docs/MEMORY_STACK_DECISION.md | 217 ++++++++++++++++++++++++++++++++++
 3 files changed, 224 insertions(+)
 create mode 100644 docs/MEMORY_STACK_DECISION.md

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 65cccb22c..5c153e97e 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -90,3 +90,8 @@
 {"id":"ctrl-010","title":"[Control Plane] Code Graph integration","description":"Learn from ai-maestro: visualize codebase as graph (classes, functions, relationships). Delta indexing for performance. Shared context for all agents. Multi-language support.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","collaborators":["Architect"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","intelligence"]}
 {"id":"ctrl-011","title":"[Control Plane] Agent health monitoring","description":"Like ai-maestro's green/red/yellow status: heartbeat checks, resource usage, error rates, stuck detection. Alert on degradation. Auto-recovery actions.","status":"open","priority":2,"issue_type":"task","assignee":"Network","depends_on":["fed-015"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","monitoring"]}
 {"id":"ctrl-012","title":"[Control Plane] Agent portability (export/import)","description":"Learn from ai-maestro: export agent as .zip with config, message history, git associations, skills. Import with conflict detection. Enable agent migration between fleets.","status":"open","priority":3,"issue_type":"task","assignee":"Storage","depends_on":["fed-024"],"created_at":"2025-12-21T14:00:00Z","updated_at":"2025-12-21T14:00:00Z","labels":["control-plane","portability"]}
+{"id":"mem-001","title":"[Memory] Integrate Mem0 as memory substrate","description":"Add Mem0 dependency to agent-trajectories. Implement MemoryBackend interface wrapping Mem0 SDK. Support both self-hosted and cloud Mem0. See MEMORY_STACK_DECISION.md for architecture.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","mem0","agent-trajectories"]}
+{"id":"mem-002","title":"[Memory] Configure Mem0 MCP for Claude Code agents","description":"Add MCP configuration for Mem0 integration with Claude Code. Document setup in agent-relay README. Test memory persistence across sessions.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["mem-001"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","mcp","claude-code"]}
+{"id":"mem-003","title":"[Memory] Build task-based trajectory layer on Mem0","description":"Implement trajectory grouping on top of Mem0 observations. Map task_id to Mem0 user_id or metadata. Support chapter-based organization within trajectories.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","collaborators":["Architect"],"depends_on":["mem-001"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","trajectories","agent-trajectories"]}
+{"id":"mem-004","title":"[Memory] Implement fleet-wide knowledge workspace","description":"Build knowledge workspace layer: decisions log, pattern library, cross-agent context. Query interface for agents to access fleet knowledge. Uses Mem0 for storage.","status":"open","priority":2,"issue_type":"task","assignee":"Storage","collaborators":["Router"],"depends_on":["mem-003"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","knowledge","agent-trajectories"]}
+{"id":"mem-005","title":"[Memory] Abstract MemoryBackend for future flexibility","description":"Create MemoryBackend interface allowing swap between Mem0, Zep, or custom SQLite+Chroma. Default to Mem0. Enable migration path if needs change.","status":"open","priority":3,"issue_type":"task","assignee":"Architect","depends_on":["mem-001"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","architecture","agent-trajectories"]}
diff --git a/docs/FEDERATION_PROPOSAL.md b/docs/FEDERATION_PROPOSAL.md
index 8ad9ef887..9bfb49254 100644
--- a/docs/FEDERATION_PROPOSAL.md
+++ b/docs/FEDERATION_PROPOSAL.md
@@ -1819,6 +1819,8 @@ Federation introduces storage requirements for **ephemeral message routing**. Du
 ```
 
 > **See:** [agent-trajectories](https://github.com/khaliqgant/agent-trajectories) for trajectory format, storage backends, and knowledge workspace.
+>
+> **Memory Layer:** agent-trajectories uses [Mem0](https://github.com/mem0ai/mem0) as the memory substrate. See [MEMORY_STACK_DECISION.md](./MEMORY_STACK_DECISION.md) for rationale.
 
 ### 17.2 Ephemeral Storage (Message Routing)
 
diff --git a/docs/MEMORY_STACK_DECISION.md b/docs/MEMORY_STACK_DECISION.md
new file mode 100644
index 000000000..012393559
--- /dev/null
+++ b/docs/MEMORY_STACK_DECISION.md
@@ -0,0 +1,217 @@
+# Memory Stack Decision: Mem0 as Foundation
+
+**Date:** 2025-12-21
+**Status:** Proposed
+
+## Decision
+
+Use [Mem0](https://github.com/mem0ai/mem0) as the memory substrate for agent-trajectories rather than building from scratch.
+
+## Context
+
+We evaluated cross-platform memory solutions before building our own:
+
+| Solution | Stars | Focus | Multi-Agent | MCP Support |
+|----------|-------|-------|-------------|-------------|
+| [Mem0](https://github.com/mem0ai/mem0) | 25k+ | Universal memory API | ✅ | ✅ |
+| [Zep](https://github.com/getzep/zep) | 3k+ | Temporal knowledge graph | ✅ | ❓ |
+| [Letta](https://github.com/letta-ai/letta) | 20k+ | Stateful agents | ✅ | ❓ |
+| [Cognee](https://github.com/topoteretes/cognee) | 4k+ | Document → graph | ⚠️ | ✅ |
+| [claude-mem](https://github.com/thedotmack/claude-mem) | Popular | Claude Code memory | ❌ Single agent | ❌ Claude only |
+
+## Why Mem0
+
+1. **Most popular** - 25k+ stars, active development, YC-backed
+2. **Multi-LLM support** - Not locked to OpenAI (works with Anthropic, etc.)
+3. **MCP integration exists** - Works with Claude Code today via [Composio MCP](https://mcp.composio.dev/mem0)
+4. **Self-hosted option** - Apache 2.0 license
+5. **Python + TypeScript SDKs** - Matches our stack
+6. **Performance claims** - +26% accuracy vs OpenAI Memory, 91% faster, 90% fewer tokens
+
+## Why Not Others
+
+| Solution | Why Not Primary |
+|----------|-----------------|
+| **Zep** | More complex (Graphiti), cloud-first pivot |
+| **Letta** | Full agent framework, not just memory |
+| **Cognee** | Document-focused, less mature |
+| **claude-mem** | Claude Code only, not multi-agent |
+
+## Architecture
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                         AGENT MEMORY STACK                                   │
+│                                                                              │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │              agent-trajectories (our layer)                          │   │
+│  │                                                                      │   │
+│  │  BUILDS ON MEM0:                        ADDS:                       │   │
+│  │  • Uses Mem0 for observation storage    • Task-based grouping       │   │
+│  │  • Uses Mem0 for semantic search        • Inter-agent events        │   │
+│  │  • Uses Mem0's multi-user isolation     • Fleet knowledge workspace │   │
+│  │                                         • .trajectory export        │   │
+│  │                                         • Decisions & patterns      │   │
+│  └──────────────────────────────────┬──────────────────────────────────┘   │
+│                                     │ uses                                  │
+│                                     ▼                                       │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                         Mem0 (memory substrate)                      │   │
+│  │                                                                      │   │
+│  │  • Observation storage + retrieval                                  │   │
+│  │  • Semantic search (vector + hybrid)                                │   │
+│  │  • Multi-user/agent isolation                                       │   │
+│  │  • MCP integration for Claude Code                                  │   │
+│  │  • Self-hosted or cloud                                             │   │
+│  └──────────────────────────────────┬──────────────────────────────────┘   │
+│                                     │                                       │
+│          ┌──────────────────────────┼──────────────────────────┐           │
+│          ▼                          ▼                          ▼           │
+│  ┌──────────────┐          ┌──────────────┐          ┌──────────────┐     │
+│  │ Claude agent │          │ Codex agent  │          │ Gemini agent │     │
+│  │ (MCP→Mem0)   │          │ (SDK→Mem0)   │          │ (SDK→Mem0)   │     │
+│  └──────────────┘          └──────────────┘          └──────────────┘     │
+│                                                                              │
+│  ◄──────────────────── agent-relay provides messaging ──────────────────►  │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## What We Build vs Use
+
+| Component | Build or Use | Owner |
+|-----------|--------------|-------|
+| Observation storage | **USE Mem0** | Mem0 |
+| Semantic search | **USE Mem0** | Mem0 |
+| Vector database | **USE Mem0** | Mem0 |
+| Task-based grouping | **BUILD** | agent-trajectories |
+| Trajectory format (.trajectory) | **BUILD** | agent-trajectories |
+| Knowledge workspace | **BUILD** | agent-trajectories |
+| Inter-agent event capture | **BUILD** | agent-trajectories |
+| Fleet-wide patterns/decisions | **BUILD** | agent-trajectories |
+| Message routing | **USE** | agent-relay |
+
+## Integration Points
+
+### 1. Mem0 as Storage Backend
+
+```typescript
+// agent-trajectories uses Mem0 for observation storage
+import { Memory } from 'mem0ai';
+
+const memory = new Memory({
+  // Self-hosted or cloud
+  api_key: process.env.MEM0_API_KEY,
+});
+
+// Store trajectory events as Mem0 memories
+async function storeTrajectoryEvent(event: TrajectoryEvent) {
+  await memory.add({
+    messages: [{ role: 'assistant', content: event.content }],
+    user_id: event.agentId,
+    metadata: {
+      trajectory_id: event.trajectoryId,
+      task_id: event.taskId,
+      event_type: event.type,
+      ts: event.ts,
+    },
+  });
+}
+
+// Retrieve relevant context for an agent
+async function getAgentContext(agentId: string, query: string) {
+  return memory.search({
+    query,
+    user_id: agentId,
+    limit: 10,
+  });
+}
+```
+
+### 2. MCP for Claude Code Agents
+
+```json
+// Claude Code MCP config (~/.claude/mcp.json)
+{
+  "mcpServers": {
+    "mem0": {
+      "command": "npx",
+      "args": ["-y", "@mem0/mcp-server"],
+      "env": {
+        "MEM0_API_KEY": "${MEM0_API_KEY}"
+      }
+    }
+  }
+}
+```
+
+### 3. agent-relay Event Emission
+
+```typescript
+// agent-relay emits events
+relay.on('message', (msg) => {
+  // Forward to agent-trajectories
+  trajectories.captureEvent({
+    type: 'inter_agent_message',
+    from: msg.from,
+    to: msg.to,
+    content: msg.content,
+    ts: msg.ts,
+  });
+});
+```
+
+## Alternatives Considered
+
+### Option A: Build Everything (Rejected)
+- SQLite + FTS5 + Chroma from scratch
+- **Rejected:** 3-4 weeks of work Mem0 already does
+
+### Option B: Fork claude-mem (Rejected)
+- Extend claude-mem for multi-agent
+- **Rejected:** Too Claude-specific, massive refactor needed
+
+### Option C: Use Zep (Considered)
+- Temporal knowledge graph is powerful
+- **Deferred:** More complex, can add later if needed
+
+### Option D: Use Mem0 + Build On Top (Selected)
+- Best of both worlds
+- Use mature memory infra, add our task/trajectory layer
+
+## Migration Path
+
+If Mem0 doesn't meet needs, the abstraction allows swapping:
+
+```typescript
+interface MemoryBackend {
+  add(memory: Memory): Promise<void>;
+  search(query: string, options: SearchOptions): Promise<Memory[]>;
+  delete(id: string): Promise<void>;
+}
+
+// Default: Mem0
+class Mem0Backend implements MemoryBackend { ... }
+
+// Alternative: Zep (if we need temporal graphs)
+class ZepBackend implements MemoryBackend { ... }
+
+// Fallback: Custom SQLite + Chroma
+class LocalBackend implements MemoryBackend { ... }
+```
+
+## Next Steps
+
+1. Add Mem0 dependency to agent-trajectories
+2. Implement MemoryBackend interface with Mem0
+3. Add MCP configuration for Claude Code agents
+4. Build task-based trajectory layer on top
+5. Integrate with agent-relay event emission
+
+## References
+
+- [Mem0 GitHub](https://github.com/mem0ai/mem0)
+- [Mem0 Documentation](https://docs.mem0.ai/)
+- [Mem0 MCP Integration](https://mcp.composio.dev/mem0)
+- [Collaborative Memory Paper](https://arxiv.org/html/2505.18279v1)
+- [MemEngine Paper](https://arxiv.org/html/2505.02099v1)

From f6b86b89165b018ced22645de6569d31a39e7093 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 15:05:40 +0000
Subject: [PATCH 10/19] Address Claude Code auth constraint in memory decision

- Add section explaining MCP-based approach where Claude Code IS the LLM
- Update integration examples to use infer:false (no API needed)
- Add direct Qdrant alternative for simpler implementation
- Document embedding options without paid APIs (Ollama, FastEmbed)
- Update next steps to reflect MCP-first approach

Key insight: With MCP, the agent handles intelligence, Mem0 becomes
pure storage + vector search. No Anthropic SDK required.
---
 docs/MEMORY_STACK_DECISION.md | 171 +++++++++++++++++++++++++++++-----
 1 file changed, 148 insertions(+), 23 deletions(-)

diff --git a/docs/MEMORY_STACK_DECISION.md b/docs/MEMORY_STACK_DECISION.md
index 012393559..6f2f4cdcc 100644
--- a/docs/MEMORY_STACK_DECISION.md
+++ b/docs/MEMORY_STACK_DECISION.md
@@ -91,43 +91,167 @@ We evaluated cross-platform memory solutions before building our own:
 | Fleet-wide patterns/decisions | **BUILD** | agent-trajectories |
 | Message routing | **USE** | agent-relay |
 
+## Constraint: Claude Code Auth Only (No Direct SDK)
+
+**Problem:** We use Claude Code via auth, not direct Anthropic SDK access. Mem0's TypeScript SDK requires LLM API access for memory extraction/compression.
+
+**Solution:** Use MCP approach where **Claude Code IS the intelligence layer**.
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                    MCP-BASED MEMORY (Recommended)                           │
+│                                                                              │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                      Claude Code Agent                               │   │
+│  │                                                                      │   │
+│  │  1. Agent decides what to remember (intelligence here)              │   │
+│  │  2. Agent calls MCP tool: add_memory("user prefers dark mode")      │   │
+│  │  3. Later: Agent calls search_memories("user preferences")          │   │
+│  │  4. Agent uses retrieved memories in context                        │   │
+│  └────────────────────────────────────┬────────────────────────────────┘   │
+│                                       │ MCP calls                           │
+│                                       ▼                                     │
+│  ┌─────────────────────────────────────────────────────────────────────┐   │
+│  │                      Mem0 MCP Server                                 │   │
+│  │                                                                      │   │
+│  │  • add_memory(content, metadata)   → Store to vector DB             │   │
+│  │  • search_memories(query)          → Vector search (no LLM needed)  │   │
+│  │  • delete_memory(id)               → Remove from storage            │   │
+│  │                                                                      │   │
+│  │  NO LLM CALLS - Pure storage + retrieval                            │   │
+│  └────────────────────────────────────┬────────────────────────────────┘   │
+│                                       │                                     │
+│                                       ▼                                     │
+│                             ┌──────────────────┐                           │
+│                             │   Qdrant / Redis  │                           │
+│                             │   (Vector Store)  │                           │
+│                             └──────────────────┘                           │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+### Why This Works
+
+| Approach | LLM Needed? | Works with Claude Code Auth? |
+|----------|-------------|------------------------------|
+| Mem0 TypeScript SDK (full) | ✅ Yes, for extraction | ❌ No - requires API key |
+| Mem0 MCP Server | ❌ No, agent is the LLM | ✅ Yes |
+| Mem0 with `infer: false` | ❌ No, raw storage | ✅ Yes |
+
+**Key Insight:** With MCP, the agent (Claude Code) does the "thinking" about what to remember. Mem0 becomes a dumb storage layer with smart retrieval (vector search).
+
+### Integration Modes
+
+| Mode | Use Case | LLM Required |
+|------|----------|--------------|
+| **MCP (Primary)** | Claude Code agents | No - agent is the LLM |
+| **SDK with infer:false** | Programmatic storage | No - raw storage |
+| **SDK with LLM** | Non-Claude agents with API keys | Yes - for extraction |
+
+### MCP Server Options
+
+1. **Mem0 Official MCP** - `@mem0/mcp-server`
+2. **Composio MCP** - Third-party wrapper
+3. **Custom MCP** - Build thin layer over Qdrant
+
+### Embeddings Without API Keys
+
+Vector search needs embeddings. Options without paid API:
+
+| Option | Pros | Cons |
+|--------|------|------|
+| **Ollama** (local) | Free, private, fast | Requires local GPU/CPU |
+| **HuggingFace TEI** | Free, self-hosted | Setup complexity |
+| **Qdrant FastEmbed** | Built-in, no setup | Limited models |
+| **OpenAI API** | Best quality | Requires API key + cost |
+
+Recommended: **Ollama + nomic-embed-text** for local development, option to swap to OpenAI embeddings in production if quality matters.
+
 ## Integration Points
 
-### 1. Mem0 as Storage Backend
+### 1. Mem0 as Storage Backend (No LLM Required)
 
 ```typescript
 // agent-trajectories uses Mem0 for observation storage
-import { Memory } from 'mem0ai';
+// NOTE: Using infer:false - no LLM API needed
+import { Memory } from 'mem0ai/oss';
 
 const memory = new Memory({
-  // Self-hosted or cloud
-  api_key: process.env.MEM0_API_KEY,
+  // Vector store only - no LLM config needed
+  vectorStore: {
+    provider: 'qdrant',
+    config: {
+      url: process.env.QDRANT_URL || 'http://localhost:6333',
+      collectionName: 'trajectories',
+    },
+  },
+  // Optional: embeddings can use local model or API
+  embedder: {
+    provider: 'ollama',  // Local embeddings, no API key
+    config: { model: 'nomic-embed-text' },
+  },
 });
 
-// Store trajectory events as Mem0 memories
+// Store trajectory events - raw storage, no extraction
 async function storeTrajectoryEvent(event: TrajectoryEvent) {
-  await memory.add({
-    messages: [{ role: 'assistant', content: event.content }],
-    user_id: event.agentId,
-    metadata: {
-      trajectory_id: event.trajectoryId,
-      task_id: event.taskId,
-      event_type: event.type,
-      ts: event.ts,
-    },
-  });
+  await memory.add(
+    [{ role: 'assistant', content: event.content }],
+    {
+      user_id: event.agentId,
+      metadata: {
+        trajectory_id: event.trajectoryId,
+        task_id: event.taskId,
+        event_type: event.type,
+        ts: event.ts,
+      },
+      infer: false,  // Skip LLM extraction - store as-is
+    }
+  );
 }
 
-// Retrieve relevant context for an agent
+// Retrieve relevant context - vector search only
 async function getAgentContext(agentId: string, query: string) {
-  return memory.search({
-    query,
+  return memory.search(query, {
     user_id: agentId,
     limit: 10,
   });
 }
 ```
 
+### 1b. Alternative: Direct Qdrant (Simpler)
+
+If Mem0 adds complexity, use Qdrant directly:
+
+```typescript
+import { QdrantClient } from '@qdrant/js-client-rest';
+
+const qdrant = new QdrantClient({ url: 'http://localhost:6333' });
+
+// Store with pre-computed embeddings (from local model)
+async function storeEvent(event: TrajectoryEvent, embedding: number[]) {
+  await qdrant.upsert('trajectories', {
+    points: [{
+      id: event.id,
+      vector: embedding,
+      payload: {
+        agentId: event.agentId,
+        content: event.content,
+        trajectoryId: event.trajectoryId,
+        ts: event.ts,
+      },
+    }],
+  });
+}
+
+// Search by vector similarity
+async function search(queryEmbedding: number[], agentId: string) {
+  return qdrant.search('trajectories', {
+    vector: queryEmbedding,
+    filter: { must: [{ key: 'agentId', match: { value: agentId } }] },
+    limit: 10,
+  });
+}
+```
+
 ### 2. MCP for Claude Code Agents
 
 ```json
@@ -202,11 +326,12 @@ class LocalBackend implements MemoryBackend { ... }
 
 ## Next Steps
 
-1. Add Mem0 dependency to agent-trajectories
-2. Implement MemoryBackend interface with Mem0
-3. Add MCP configuration for Claude Code agents
-4. Build task-based trajectory layer on top
-5. Integrate with agent-relay event emission
+1. **Set up Qdrant** - Local vector store (docker or binary)
+2. **Set up Ollama** - Local embeddings (nomic-embed-text)
+3. **Configure Mem0 MCP** - For Claude Code agents
+4. **Implement MemoryBackend** - With `infer: false` for programmatic use
+5. **Build trajectory layer** - Task grouping, patterns on top
+6. **Integrate with agent-relay** - Event emission to trajectories
 
 ## References
 

From e5d8082bbe81d80ee3844498f75ab0233ccedce6 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 15:49:39 +0000
Subject: [PATCH 11/19] Add Hooks API as core agent-relay primitive

- Define pattern namespace system (@relay:, @memory:, @custom:)
- Add hook lifecycle events (onSessionStart, onOutput, etc.)
- Document HookContext and programmatic API
- Add relay.config.ts configuration format
- Create 6 hook-* tasks for implementation roadmap

Hooks enable:
- Automatic memory prompts at session end
- User-defined pattern handlers
- Integration points for extensions

See docs/HOOKS_API.md for full design.
---
 .beads/issues.jsonl |   6 +
 docs/HOOKS_API.md   | 340 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 346 insertions(+)
 create mode 100644 docs/HOOKS_API.md

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 5c153e97e..37a6e2679 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -95,3 +95,9 @@
 {"id":"mem-003","title":"[Memory] Build task-based trajectory layer on Mem0","description":"Implement trajectory grouping on top of Mem0 observations. Map task_id to Mem0 user_id or metadata. Support chapter-based organization within trajectories.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","collaborators":["Architect"],"depends_on":["mem-001"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","trajectories","agent-trajectories"]}
 {"id":"mem-004","title":"[Memory] Implement fleet-wide knowledge workspace","description":"Build knowledge workspace layer: decisions log, pattern library, cross-agent context. Query interface for agents to access fleet knowledge. Uses Mem0 for storage.","status":"open","priority":2,"issue_type":"task","assignee":"Storage","collaborators":["Router"],"depends_on":["mem-003"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","knowledge","agent-trajectories"]}
 {"id":"mem-005","title":"[Memory] Abstract MemoryBackend for future flexibility","description":"Create MemoryBackend interface allowing swap between Mem0, Zep, or custom SQLite+Chroma. Default to Mem0. Enable migration path if needs change.","status":"open","priority":3,"issue_type":"task","assignee":"Architect","depends_on":["mem-001"],"created_at":"2025-12-21T15:00:00Z","updated_at":"2025-12-21T15:00:00Z","labels":["memory","architecture","agent-trajectories"]}
+{"id":"hook-001","title":"[Hooks] Implement pattern namespace registry","description":"Create src/hooks/pattern-registry.ts. Register pattern handlers by namespace (@relay:, @memory:, @custom:). Match output against patterns. Route to handlers. Support built-in and user-defined namespaces. See HOOKS_API.md.","status":"open","priority":1,"issue_type":"task","assignee":"Router","created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","core"]}
+{"id":"hook-002","title":"[Hooks] Implement hook lifecycle events","description":"Add lifecycle events to tmux wrapper: onSessionStart, onSessionEnd, onOutput, onIdle, onMessageReceived. Emit events with HookContext. Allow handlers to inject text. See HOOKS_API.md.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["hook-001"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","core"]}
+{"id":"hook-003","title":"[Hooks] Implement @memory: built-in handler","description":"Create built-in @memory: pattern handler. Support actions: save, search, forget, list. Connect to Mem0 backend. Inject results back to agent. See HOOKS_API.md and MEMORY_STACK_DECISION.md.","status":"open","priority":1,"issue_type":"task","assignee":"Storage","collaborators":["Router"],"depends_on":["hook-001","mem-001"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","memory"]}
+{"id":"hook-004","title":"[Hooks] Implement relay.config.ts loader","description":"Create config loader for relay.config.ts. Support project root and ~/.config/agent-relay/. Load pattern handlers and lifecycle hooks. Type-safe config with RelayConfig interface. See HOOKS_API.md.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["hook-002"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","config"]}
+{"id":"hook-005","title":"[Hooks] Programmatic hook API","description":"Export Relay class with pattern() and on() methods. Allow programmatic registration of handlers. TypeScript types for HookContext, PatternHandler. See HOOKS_API.md.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","collaborators":["Router"],"depends_on":["hook-002"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","api"]}
+{"id":"hook-006","title":"[Hooks] Document hooks API with examples","description":"Write hooks documentation: pattern syntax, lifecycle events, config file format, programmatic API. Include examples for @memory:, @notify:, @deploy: patterns. Update README.","status":"open","priority":3,"issue_type":"task","assignee":"Architect","depends_on":["hook-005"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","docs"]}
diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md
new file mode 100644
index 000000000..c3b73c65b
--- /dev/null
+++ b/docs/HOOKS_API.md
@@ -0,0 +1,340 @@
+# Agent Relay Hooks API
+
+**Date:** 2025-12-21
+**Status:** Proposed
+
+## Overview
+
+Hooks are a core primitive in agent-relay that allow:
+1. **Intercepting agent output** - React to patterns, events, session lifecycle
+2. **Injecting prompts** - Guide agent behavior automatically
+3. **Extending with namespaces** - User-defined `@pattern:` handlers
+
+## Pattern Namespaces
+
+agent-relay intercepts output patterns in the format `@namespace:target message`.
+
+### Built-in Namespaces
+
+| Namespace | Purpose | Example |
+|-----------|---------|---------|
+| `@relay:` | Inter-agent messaging | `@relay:Alice Check the tests` |
+| `@memory:` | Memory operations | `@memory:save User prefers dark mode` |
+| `@broadcast:` | Broadcast to all | `@relay:* Status update` |
+
+### Memory Namespace
+
+```
+@memory:save <content>        # Store a memory
+@memory:search <query>        # Retrieve relevant memories
+@memory:forget <id>           # Delete a memory
+@memory:list                  # List recent memories
+```
+
+### User-Defined Namespaces
+
+Users can register custom pattern handlers:
+
+```typescript
+// relay.config.ts
+export default {
+  patterns: {
+    // Custom namespace: @deploy:
+    deploy: {
+      handler: async (target, message, context) => {
+        if (target === 'staging') {
+          await exec('npm run deploy:staging');
+          return { inject: 'Deployed to staging successfully' };
+        }
+      }
+    },
+
+    // Custom namespace: @notify:
+    notify: {
+      handler: async (target, message, context) => {
+        await fetch('https://slack.com/api/post', {
+          body: JSON.stringify({ channel: target, text: message })
+        });
+      }
+    }
+  }
+};
+```
+
+Usage in agent output:
+```
+@deploy:staging Release v1.2.3
+@notify:#engineering Build complete
+```
+
+## Hook Lifecycle
+
+```
+┌─────────────────────────────────────────────────────────────────────────────┐
+│                           HOOK LIFECYCLE                                     │
+│                                                                              │
+│  SESSION START                                                               │
+│       │                                                                      │
+│       ▼                                                                      │
+│  ┌─────────────────┐                                                        │
+│  │ onSessionStart  │ → Inject initial context, load memories                │
+│  └────────┬────────┘                                                        │
+│           │                                                                  │
+│           ▼                                                                  │
+│  ┌─────────────────────────────────────────────────────────────────┐       │
+│  │                      AGENT RUNNING                               │       │
+│  │                                                                  │       │
+│  │   Agent Output ──► onOutput ──► Pattern Match? ──► Handler      │       │
+│  │        │                              │                          │       │
+│  │        │                              ▼                          │       │
+│  │        │                        @relay: → route message          │       │
+│  │        │                        @memory: → store/search          │       │
+│  │        │                        @custom: → user handler          │       │
+│  │        │                                                         │       │
+│  │        ▼                                                         │       │
+│  │   onToolCall ──► Before/after tool execution                    │       │
+│  │        │                                                         │       │
+│  │        ▼                                                         │       │
+│  │   onMessageReceived ──► Inject incoming relay messages          │       │
+│  │        │                                                         │       │
+│  │        ▼                                                         │       │
+│  │   onIdle ──► Periodic prompts (memory review, status)           │       │
+│  │                                                                  │       │
+│  └─────────────────────────────────────────────────────────────────┘       │
+│           │                                                                  │
+│           ▼                                                                  │
+│  ┌─────────────────┐                                                        │
+│  │ onSessionEnd    │ → Prompt for memory save, cleanup                      │
+│  └─────────────────┘                                                        │
+│                                                                              │
+└─────────────────────────────────────────────────────────────────────────────┘
+```
+
+## Hook API
+
+### Configuration File
+
+```typescript
+// relay.config.ts (in project root or ~/.config/agent-relay/)
+import type { RelayConfig } from 'agent-relay';
+
+export default {
+  // Pattern handlers (namespaces)
+  patterns: {
+    memory: 'builtin',  // Use built-in memory handler
+    deploy: { handler: myDeployHandler },
+  },
+
+  // Lifecycle hooks
+  hooks: {
+    onSessionStart: async (ctx) => {
+      // Load relevant memories
+      const memories = await ctx.memory.search(ctx.workingDir);
+      return { inject: `Relevant context:\n${memories}` };
+    },
+
+    onSessionEnd: async (ctx) => {
+      return {
+        inject: `Session ending. Save any important learnings with @memory:save`
+      };
+    },
+
+    onOutput: async (output, ctx) => {
+      // Custom output processing
+      if (output.includes('ERROR')) {
+        await ctx.notify('errors', output);
+      }
+    },
+
+    onIdle: async (ctx) => {
+      // Called after 30s of no output
+      // Could prompt for status update
+    },
+  },
+
+  // Memory configuration
+  memory: {
+    backend: 'mem0',  // or 'qdrant', 'custom'
+    autoSave: false,  // Don't auto-extract, let agent decide
+    promptOnEnd: true, // Prompt to save at session end
+  },
+} satisfies RelayConfig;
+```
+
+### Programmatic API
+
+```typescript
+import { Relay } from 'agent-relay';
+
+const relay = new Relay({
+  name: 'MyAgent',
+});
+
+// Register pattern handler
+relay.pattern('deploy', async (target, message, ctx) => {
+  console.log(`Deploying to ${target}: ${message}`);
+  await deploy(target);
+  return { inject: `Deployed to ${target}` };
+});
+
+// Register lifecycle hook
+relay.on('sessionStart', async (ctx) => {
+  const memories = await loadMemories(ctx.agentId);
+  ctx.inject(`Your memories:\n${memories}`);
+});
+
+relay.on('sessionEnd', async (ctx) => {
+  ctx.inject('Save important learnings with @memory:save');
+});
+
+// Start with wrapped command
+relay.wrap('claude');
+```
+
+### Hook Context
+
+```typescript
+interface HookContext {
+  // Agent info
+  agentId: string;
+  agentName: string;
+  sessionId: string;
+
+  // Environment
+  workingDir: string;
+  env: Record<string, string>;
+
+  // Actions
+  inject(text: string): void;      // Inject text to agent stdin
+  send(to: string, msg: string): void;  // Send relay message
+
+  // Built-in services
+  memory: MemoryService;           // Memory operations
+  relay: RelayService;             // Messaging operations
+
+  // Session state
+  output: string[];                // All output so far
+  messages: Message[];             // All relay messages
+}
+```
+
+## Built-in Pattern Handlers
+
+### @relay: (Messaging)
+
+```typescript
+// Built-in, always available
+relay.pattern('relay', async (target, message, ctx) => {
+  if (target === '*') {
+    await ctx.relay.broadcast(message);
+  } else {
+    await ctx.relay.send(target, message);
+  }
+});
+```
+
+### @memory: (Memory Operations)
+
+```typescript
+// Built-in when memory is configured
+relay.pattern('memory', async (action, content, ctx) => {
+  switch (action) {
+    case 'save':
+      await ctx.memory.add(content, { agentId: ctx.agentId });
+      return { inject: `✓ Saved to memory` };
+
+    case 'search':
+      const results = await ctx.memory.search(content);
+      return { inject: `Memories:\n${format(results)}` };
+
+    case 'forget':
+      await ctx.memory.delete(content);
+      return { inject: `✓ Forgotten` };
+  }
+});
+```
+
+## Example: Full Memory Integration
+
+```typescript
+// relay.config.ts
+export default {
+  patterns: {
+    memory: 'builtin',
+  },
+
+  hooks: {
+    onSessionStart: async (ctx) => {
+      // Search for relevant context based on current directory/project
+      const projectMemories = await ctx.memory.search(
+        `project: ${ctx.workingDir}`
+      );
+      const userPrefs = await ctx.memory.search('user preferences');
+
+      if (projectMemories.length || userPrefs.length) {
+        return {
+          inject: `
+[CONTEXT FROM MEMORY]
+${projectMemories.map(m => `- ${m.content}`).join('\n')}
+
+[USER PREFERENCES]
+${userPrefs.map(m => `- ${m.content}`).join('\n')}
+`
+        };
+      }
+    },
+
+    onSessionEnd: async (ctx) => {
+      return {
+        inject: `
+[SESSION ENDING]
+If you learned anything important, save it:
+  @memory:save <what you learned>
+
+Examples:
+  @memory:save User prefers TypeScript over JavaScript
+  @memory:save This project uses Prisma for database access
+  @memory:save Auth tokens stored in httpOnly cookies
+`
+      };
+    },
+  },
+
+  memory: {
+    backend: 'mem0',
+    config: {
+      vectorStore: { provider: 'qdrant', url: 'http://localhost:6333' },
+      embedder: { provider: 'ollama', model: 'nomic-embed-text' },
+    },
+  },
+};
+```
+
+## Escaping Patterns
+
+To output literal `@namespace:` without triggering handlers:
+
+```
+\@relay:AgentName    # Outputs literally, not routed
+\\@relay:AgentName   # Outputs \@relay:AgentName
+```
+
+## Priority & Order
+
+1. Patterns are matched in order of specificity
+2. Built-in patterns run before user patterns (unless overridden)
+3. Multiple handlers for same pattern run in registration order
+4. Return `{ stop: true }` to prevent further handlers
+
+## Next Steps
+
+1. Implement pattern registry in agent-relay daemon
+2. Add hook lifecycle events to wrapper
+3. Implement @memory: built-in handler
+4. Create relay.config.ts loader
+5. Add documentation and examples
+
+## Related
+
+- [MEMORY_STACK_DECISION.md](./MEMORY_STACK_DECISION.md) - Memory backend choice
+- [FEDERATION_PROPOSAL.md](./FEDERATION_PROPOSAL.md) - Cross-server messaging

From 37238ba227c165e3e1e64eaba8b59e2e9ba3e1e9 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 16:07:08 +0000
Subject: [PATCH 12/19] Document lifecycle events with implementation details

- Add detailed spec for each lifecycle event:
  - onSessionStart: when, trigger point, code example, use cases
  - onOutput: polling mechanism, handler signature, performance notes
  - onIdle: threshold config, once-per-period firing
  - onMessageReceived: suppress/modify capability
  - onSessionEnd: SIGINT handling, wait for response
- Add HookEmitter class design
- Add Event Summary Table
- Create 7 new granular tasks (hook-007 to hook-013)

Tasks cover: HookEmitter, each lifecycle event, and types.
---
 .beads/issues.jsonl |   7 +
 docs/HOOKS_API.md   | 320 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 327 insertions(+)

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 37a6e2679..43727fe22 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -101,3 +101,10 @@
 {"id":"hook-004","title":"[Hooks] Implement relay.config.ts loader","description":"Create config loader for relay.config.ts. Support project root and ~/.config/agent-relay/. Load pattern handlers and lifecycle hooks. Type-safe config with RelayConfig interface. See HOOKS_API.md.","status":"open","priority":2,"issue_type":"task","assignee":"CLI","depends_on":["hook-002"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","config"]}
 {"id":"hook-005","title":"[Hooks] Programmatic hook API","description":"Export Relay class with pattern() and on() methods. Allow programmatic registration of handlers. TypeScript types for HookContext, PatternHandler. See HOOKS_API.md.","status":"open","priority":2,"issue_type":"task","assignee":"Architect","collaborators":["Router"],"depends_on":["hook-002"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","api"]}
 {"id":"hook-006","title":"[Hooks] Document hooks API with examples","description":"Write hooks documentation: pattern syntax, lifecycle events, config file format, programmatic API. Include examples for @memory:, @notify:, @deploy: patterns. Update README.","status":"open","priority":3,"issue_type":"task","assignee":"Architect","depends_on":["hook-005"],"created_at":"2025-12-21T16:00:00Z","updated_at":"2025-12-21T16:00:00Z","labels":["hooks","docs"]}
+{"id":"hook-007","title":"[Hooks] Create HookEmitter class","description":"Create src/hooks/emitter.ts with HookEmitter class. Support on(event, handler), emit(event, ...args). Handle async handlers. Support stop propagation. Load handlers from config object. See HOOKS_API.md.","status":"open","priority":1,"issue_type":"task","assignee":"Router","created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","core"]}
+{"id":"hook-008","title":"[Hooks] Implement onSessionStart event","description":"Add onSessionStart hook to TmuxWrapper.start(). Fire after tmux spawn, before polling. Pass HookContext with agentId, sessionId, workingDir. Inject returned text. See HOOKS_API.md lifecycle spec.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
+{"id":"hook-009","title":"[Hooks] Implement onSessionEnd event","description":"Add onSessionEnd hook to TmuxWrapper.stop() and SIGINT handler. Fire before cleanup. Inject returned text and wait for agent response (5s timeout). See HOOKS_API.md lifecycle spec.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
+{"id":"hook-010","title":"[Hooks] Implement onOutput event","description":"Add onOutput hook to TmuxWrapper.pollOutput(). Fire on new output diff. Pass output string and context. Keep handlers fast (warn if >100ms). See HOOKS_API.md lifecycle spec.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
+{"id":"hook-011","title":"[Hooks] Implement onIdle event","description":"Add onIdle hook to TmuxWrapper.pollOutput(). Fire when no output for idleThreshold (default 30s). Fire once per idle period, not continuously. Configurable threshold. See HOOKS_API.md lifecycle spec.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
+{"id":"hook-012","title":"[Hooks] Implement onMessageReceived event","description":"Add onMessageReceived hook to TmuxWrapper.handleIncomingMessage(). Fire before injection. Allow suppress or custom inject. Pass RelayMessage and context. See HOOKS_API.md lifecycle spec.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
+{"id":"hook-013","title":"[Hooks] Define HookContext and HookResult types","description":"Create src/hooks/types.ts with HookContext interface (agentId, sessionId, workingDir, env, inject(), send(), memory, relay, output[], messages[]). Define HookResult (inject?, suppress?, stop?). See HOOKS_API.md.","status":"open","priority":1,"issue_type":"task","assignee":"Architect","created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","types"]}
diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md
index c3b73c65b..400899423 100644
--- a/docs/HOOKS_API.md
+++ b/docs/HOOKS_API.md
@@ -110,6 +110,326 @@ Usage in agent output:
 └─────────────────────────────────────────────────────────────────────────────┘
 ```
 
+## Lifecycle Events: Detailed Specification
+
+### onSessionStart
+
+**When:** Immediately after tmux session is created, before agent CLI starts producing output.
+
+**Trigger point:** `TmuxWrapper.start()` after spawn, before first `pollOutput()`.
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+async start(command: string) {
+  await this.spawnTmuxSession(command);
+
+  // TRIGGER: onSessionStart
+  const result = await this.hooks.emit('sessionStart', {
+    agentId: this.agentId,
+    agentName: this.agentName,
+    sessionId: this.sessionId,
+    workingDir: process.cwd(),
+  });
+
+  // Inject any returned text (e.g., loaded memories)
+  if (result?.inject) {
+    await this.injectText(result.inject);
+  }
+
+  this.startPolling();
+}
+```
+
+**Use cases:**
+- Load relevant memories from Mem0 based on project/directory
+- Inject user preferences ("User prefers TypeScript")
+- Set up agent context ("You are working on the auth module")
+
+**Handler signature:**
+```typescript
+onSessionStart: (ctx: HookContext) => Promise<HookResult | void>
+```
+
+---
+
+### onOutput
+
+**When:** Every time new output is captured from the agent (polled every 100ms, fires on diff).
+
+**Trigger point:** `TmuxWrapper.pollOutput()` when `newOutput !== lastOutput`.
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+async pollOutput() {
+  const paneContent = await this.capturePane();
+  const newOutput = this.diffOutput(paneContent, this.lastContent);
+
+  if (newOutput) {
+    this.lastContent = paneContent;
+    this.lastOutputTime = Date.now();
+
+    // TRIGGER: onOutput
+    await this.hooks.emit('output', newOutput, this.context);
+
+    // Check for @pattern: matches
+    await this.matchPatterns(newOutput);
+  }
+}
+```
+
+**Use cases:**
+- Log all agent output to file/database
+- Detect errors and alert
+- Track progress metrics
+- Custom pattern matching beyond @namespace:
+
+**Handler signature:**
+```typescript
+onOutput: (output: string, ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Note:** This fires frequently. Keep handlers fast. Don't inject on every output.
+
+---
+
+### onIdle
+
+**When:** Agent has produced no output for `idleThreshold` (default 30 seconds).
+
+**Trigger point:** `TmuxWrapper.pollOutput()` when idle time exceeds threshold.
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+private idleThreshold = 30000; // 30 seconds
+private lastIdleNotification = 0;
+
+async pollOutput() {
+  // ... capture and diff ...
+
+  const idleTime = Date.now() - this.lastOutputTime;
+
+  // TRIGGER: onIdle (once per idle period, not continuously)
+  if (idleTime > this.idleThreshold &&
+      Date.now() - this.lastIdleNotification > this.idleThreshold) {
+    this.lastIdleNotification = Date.now();
+
+    const result = await this.hooks.emit('idle', this.context);
+    if (result?.inject) {
+      await this.injectText(result.inject);
+    }
+  }
+}
+```
+
+**Use cases:**
+- Prompt agent for status update
+- Ask if agent is stuck or needs help
+- Suggest next steps
+- Trigger auto-save of work in progress
+
+**Handler signature:**
+```typescript
+onIdle: (ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Configuration:**
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onIdle: async (ctx) => {
+      return { inject: '[STATUS CHECK] Are you making progress?' };
+    }
+  },
+  options: {
+    idleThreshold: 60000, // 60 seconds instead of default 30
+  }
+};
+```
+
+---
+
+### onMessageReceived
+
+**When:** A relay message arrives for this agent from another agent or broadcast.
+
+**Trigger point:** `TmuxWrapper.handleIncomingMessage()` when daemon delivers message.
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+async handleIncomingMessage(message: RelayMessage) {
+  // TRIGGER: onMessageReceived (before injection)
+  const result = await this.hooks.emit('messageReceived', message, this.context);
+
+  // Allow handler to modify or suppress injection
+  if (result?.suppress) {
+    return; // Don't inject this message
+  }
+
+  const textToInject = result?.inject || this.formatMessage(message);
+  await this.injectText(textToInject);
+}
+```
+
+**Use cases:**
+- Custom message formatting
+- Filter/suppress certain messages
+- Log incoming messages
+- Transform message content
+- Route to different handlers based on sender
+
+**Handler signature:**
+```typescript
+onMessageReceived: (message: RelayMessage, ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Example - Custom formatting:**
+```typescript
+onMessageReceived: async (msg, ctx) => {
+  // Add priority indicator
+  const priority = msg.metadata?.urgent ? '🚨 URGENT' : '📨';
+  return {
+    inject: `${priority} Message from ${msg.from}: ${msg.content}`
+  };
+}
+```
+
+---
+
+### onSessionEnd
+
+**When:** Agent session is ending (user pressed Ctrl+C, agent exited, or explicit stop).
+
+**Trigger point:** `TmuxWrapper.stop()` or SIGINT/SIGTERM handler.
+
+```typescript
+// In src/wrapper/tmux-wrapper.ts
+async stop() {
+  // TRIGGER: onSessionEnd (before cleanup)
+  const result = await this.hooks.emit('sessionEnd', this.context);
+
+  if (result?.inject) {
+    await this.injectText(result.inject);
+    // Give agent time to process and respond
+    await this.waitForResponse(5000);
+  }
+
+  await this.cleanup();
+}
+
+// Also in signal handlers
+process.on('SIGINT', async () => {
+  await wrapper.stop();
+  process.exit(0);
+});
+```
+
+**Use cases:**
+- Prompt agent to save important learnings
+- Capture final summary
+- Cleanup resources
+- Save session transcript
+
+**Handler signature:**
+```typescript
+onSessionEnd: (ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Example - Memory prompt:**
+```typescript
+onSessionEnd: async (ctx) => {
+  return {
+    inject: `
+[SESSION ENDING]
+Before you go, save any important learnings:
+  @memory:save <what you learned>
+`
+  };
+}
+```
+
+---
+
+### onToolCall (Future)
+
+**When:** Agent invokes a tool (requires parsing tool calls from output).
+
+**Status:** Future enhancement - requires understanding agent's tool output format.
+
+```typescript
+onToolCall: (tool: string, args: any, ctx: HookContext) => Promise<HookResult | void>
+```
+
+**Use cases:**
+- Audit tool usage
+- Block dangerous operations
+- Inject additional context before tool runs
+
+---
+
+## Implementation Location
+
+All lifecycle events are triggered from `src/wrapper/tmux-wrapper.ts`:
+
+```typescript
+// src/wrapper/tmux-wrapper.ts - Current structure
+export class TmuxWrapper {
+  // ADD: Hook emitter
+  private hooks: HookEmitter;
+
+  constructor(config: WrapperConfig) {
+    this.hooks = new HookEmitter(config.hooks);
+  }
+
+  async start() { /* triggers onSessionStart */ }
+  async pollOutput() { /* triggers onOutput, onIdle */ }
+  async handleIncomingMessage() { /* triggers onMessageReceived */ }
+  async stop() { /* triggers onSessionEnd */ }
+}
+```
+
+New file needed: `src/hooks/emitter.ts`:
+
+```typescript
+// src/hooks/emitter.ts
+export class HookEmitter {
+  private handlers: Map<string, HookHandler[]>;
+
+  constructor(config?: HooksConfig) {
+    this.handlers = new Map();
+    if (config) this.loadFromConfig(config);
+  }
+
+  on(event: HookEvent, handler: HookHandler) {
+    const existing = this.handlers.get(event) || [];
+    this.handlers.set(event, [...existing, handler]);
+  }
+
+  async emit(event: HookEvent, ...args: any[]): Promise<HookResult | void> {
+    const handlers = this.handlers.get(event) || [];
+    let result: HookResult | void;
+
+    for (const handler of handlers) {
+      result = await handler(...args);
+      if (result?.stop) break; // Stop propagation
+    }
+
+    return result;
+  }
+}
+```
+
+## Event Summary Table
+
+| Event | Trigger | Frequency | Can Inject? | Use Case |
+|-------|---------|-----------|-------------|----------|
+| `onSessionStart` | Session spawn | Once | ✅ Yes | Load memories, set context |
+| `onOutput` | New output captured | Many (100ms poll) | ⚠️ Rarely | Logging, error detection |
+| `onIdle` | No output for 30s | Periodic | ✅ Yes | Status prompts |
+| `onMessageReceived` | Relay message arrives | Per message | ✅ Yes | Custom formatting |
+| `onSessionEnd` | Session closing | Once | ✅ Yes | Save memories, cleanup |
+| `onToolCall` | Tool invoked | Per tool | Future | Audit, block |
+
 ## Hook API
 
 ### Configuration File

From dbaa97b0c7295af4be9ff871eba125856f5061bc Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 16:10:53 +0000
Subject: [PATCH 13/19] Define tightly scoped hook capabilities and sandboxing

Hook Context (read-only):
- agentId, agentName, sessionId, workingDir, projectName
- recentOutput (last 50 chunks), recentMessages (last 20)
- Timing: sessionStartTime, lastOutputTime, idleSeconds

Hook Result (allowed actions):
- inject: max 2000 chars, sanitized
- suppress: for onMessageReceived only
- stop: prevent other handlers
- sendMessage: one per invocation, max 5000 chars
- log: audit log entry

Prohibited:
- File system access
- Shell execution
- Network requests
- Env modification
- Full output access

Capability escalation via explicit config grants.
Added hook-014 task for sandboxing implementation.
---
 .beads/issues.jsonl |   1 +
 docs/HOOKS_API.md   | 126 ++++++++++++++++++++++++++++++++++++--------
 2 files changed, 106 insertions(+), 21 deletions(-)

diff --git a/.beads/issues.jsonl b/.beads/issues.jsonl
index 43727fe22..f0d944081 100644
--- a/.beads/issues.jsonl
+++ b/.beads/issues.jsonl
@@ -108,3 +108,4 @@
 {"id":"hook-011","title":"[Hooks] Implement onIdle event","description":"Add onIdle hook to TmuxWrapper.pollOutput(). Fire when no output for idleThreshold (default 30s). Fire once per idle period, not continuously. Configurable threshold. See HOOKS_API.md lifecycle spec.","status":"open","priority":2,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
 {"id":"hook-012","title":"[Hooks] Implement onMessageReceived event","description":"Add onMessageReceived hook to TmuxWrapper.handleIncomingMessage(). Fire before injection. Allow suppress or custom inject. Pass RelayMessage and context. See HOOKS_API.md lifecycle spec.","status":"open","priority":1,"issue_type":"task","assignee":"Router","depends_on":["hook-007"],"created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","lifecycle"]}
 {"id":"hook-013","title":"[Hooks] Define HookContext and HookResult types","description":"Create src/hooks/types.ts with HookContext interface (agentId, sessionId, workingDir, env, inject(), send(), memory, relay, output[], messages[]). Define HookResult (inject?, suppress?, stop?). See HOOKS_API.md.","status":"open","priority":1,"issue_type":"task","assignee":"Architect","created_at":"2025-12-21T17:00:00Z","updated_at":"2025-12-21T17:00:00Z","labels":["hooks","types"]}
+{"id":"hook-014","title":"[Hooks] Implement hook sandboxing and limits","description":"Enforce hook restrictions: Object.freeze(ctx) for immutability, 2000 char inject limit, 5000 char message limit, one sendMessage per invocation. Add capability escalation config (allowNetworkInHooks, allowFileReadInHooks, unlimitedInjection). See HOOKS_API.md sandboxing section.","status":"open","priority":1,"issue_type":"task","assignee":"Security","depends_on":["hook-013"],"created_at":"2025-12-21T18:00:00Z","updated_at":"2025-12-21T18:00:00Z","labels":["hooks","security"]}
diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md
index 400899423..dc1fad17a 100644
--- a/docs/HOOKS_API.md
+++ b/docs/HOOKS_API.md
@@ -511,33 +511,117 @@ relay.on('sessionEnd', async (ctx) => {
 relay.wrap('claude');
 ```
 
-### Hook Context
+### Hook Context (Read-Only)
+
+Hooks receive a **read-only context** with limited information. They cannot take actions directly - they return a `HookResult` that the relay system interprets.
 
 ```typescript
 interface HookContext {
-  // Agent info
-  agentId: string;
-  agentName: string;
-  sessionId: string;
-
-  // Environment
-  workingDir: string;
-  env: Record<string, string>;
-
-  // Actions
-  inject(text: string): void;      // Inject text to agent stdin
-  send(to: string, msg: string): void;  // Send relay message
-
-  // Built-in services
-  memory: MemoryService;           // Memory operations
-  relay: RelayService;             // Messaging operations
-
-  // Session state
-  output: string[];                // All output so far
-  messages: Message[];             // All relay messages
+  // Agent identity (read-only)
+  readonly agentId: string;
+  readonly agentName: string;
+  readonly sessionId: string;
+
+  // Environment (read-only)
+  readonly workingDir: string;
+  readonly projectName: string;
+
+  // Session state (read-only, last N items)
+  readonly recentOutput: readonly string[];  // Last 50 output chunks
+  readonly recentMessages: readonly Message[]; // Last 20 messages
+
+  // Timing (read-only)
+  readonly sessionStartTime: number;
+  readonly lastOutputTime: number;
+  readonly idleSeconds: number;
+}
+```
+
+### Hook Result (Allowed Actions)
+
+Hooks communicate intent via return value. The relay system executes allowed actions.
+
+```typescript
+interface HookResult {
+  // Text injection (max 2000 chars, sanitized)
+  inject?: string;
+
+  // Suppress default behavior (onMessageReceived only)
+  suppress?: boolean;
+
+  // Stop other handlers from running
+  stop?: boolean;
+
+  // Send relay message (limited to 1 per hook invocation)
+  sendMessage?: {
+    to: string;      // Agent name or "*" for broadcast
+    content: string; // Max 5000 chars
+  };
+
+  // Log to relay audit log (optional)
+  log?: string;
 }
 ```
 
+### What Hooks CANNOT Do
+
+| Prohibited | Why |
+|------------|-----|
+| File system access | Use @pattern: handlers for controlled file ops |
+| Shell execution | Security risk - use dedicated patterns |
+| Network requests | Use @pattern: handlers with explicit config |
+| Modify env vars | Read-only context |
+| Access full output | Memory bounded - only recent chunks |
+| Unlimited injection | Max 2000 chars per injection |
+| Multiple messages | One sendMessage per invocation |
+
+### Sandboxing
+
+Hooks run in a restricted context:
+
+```typescript
+// Hooks are pure functions: context in, result out
+type HookHandler = (ctx: HookContext) => Promise<HookResult | void>;
+
+// The relay system enforces limits
+function executeHook(handler: HookHandler, ctx: HookContext): HookResult {
+  const result = await handler(Object.freeze(ctx)); // Immutable context
+
+  // Enforce limits
+  if (result?.inject && result.inject.length > 2000) {
+    result.inject = result.inject.slice(0, 2000) + '... [truncated]';
+  }
+
+  if (result?.sendMessage?.content.length > 5000) {
+    throw new HookError('Message exceeds 5000 char limit');
+  }
+
+  return result;
+}
+```
+
+### Capability Escalation
+
+For advanced use cases, users can enable additional capabilities in config:
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onSessionEnd: myHandler,
+  },
+
+  // Explicit capability grants (default: all false)
+  capabilities: {
+    allowNetworkInHooks: false,  // Permit fetch() in handlers
+    allowFileReadInHooks: false, // Permit fs.readFile()
+    unlimitedInjection: false,   // Remove 2000 char limit
+  }
+};
+```
+
+**Default is locked down.** Users must explicitly opt-in to dangerous capabilities.
+
 ## Built-in Pattern Handlers
 
 ### @relay: (Messaging)

From be9f0da74646fa208fdba13316049db029d281e8 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 16:16:40 +0000
Subject: [PATCH 14/19] Add 7 practical hook examples to documentation

Examples cover:
1. Memory integration - load context, prompt to save
2. Error detection - alert coordinator on failures
3. Message filtering - suppress/highlight by priority
4. Custom pattern - @ticket: handler
5. Coordinator hooks - special behavior for lead agent
6. Minimal config - just session end prompt
7. Debug mode - log all events

Each example demonstrates:
- Using HookContext (read-only)
- Returning HookResult (inject, sendMessage, log, suppress)
- Staying within sandbox limits
---
 docs/HOOKS_API.md | 215 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 215 insertions(+)

diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md
index dc1fad17a..09e3ccaaf 100644
--- a/docs/HOOKS_API.md
+++ b/docs/HOOKS_API.md
@@ -622,6 +622,221 @@ export default {
 
 **Default is locked down.** Users must explicitly opt-in to dangerous capabilities.
 
+---
+
+## Examples
+
+### Example 1: Memory Integration (Complete)
+
+Load memories at session start, prompt to save at session end.
+
+```typescript
+// relay.config.ts
+import type { RelayConfig } from 'agent-relay';
+
+export default {
+  hooks: {
+    onSessionStart: async (ctx) => {
+      // Inject project context from memory
+      return {
+        inject: `[CONTEXT] Working in ${ctx.projectName} (${ctx.workingDir})`,
+        log: `Session started for ${ctx.agentName}`
+      };
+    },
+
+    onSessionEnd: async (ctx) => {
+      return {
+        inject: `[SESSION END] Save learnings with @memory:save <insight>`
+      };
+    },
+
+    onIdle: async (ctx) => {
+      // Only prompt if idle for extended period
+      if (ctx.idleSeconds > 60) {
+        return {
+          inject: '[STATUS] Are you blocked? Need help?'
+        };
+      }
+    }
+  }
+} satisfies RelayConfig;
+```
+
+---
+
+### Example 2: Error Detection and Alerting
+
+Detect errors in output and notify another agent.
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onOutput: async (output, ctx) => {
+      // Check for error patterns
+      if (output.includes('Error:') || output.includes('FAILED')) {
+        return {
+          sendMessage: {
+            to: 'Coordinator',
+            content: `Alert: ${ctx.agentName} encountered error in ${ctx.projectName}`
+          },
+          log: `Error detected: ${output.slice(0, 100)}`
+        };
+      }
+      // No injection - just observing
+    }
+  }
+};
+```
+
+---
+
+### Example 3: Message Filtering
+
+Suppress low-priority messages, format high-priority ones.
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onMessageReceived: async (msg, ctx) => {
+      // Suppress status broadcasts while busy
+      if (msg.from === '*' && msg.content.startsWith('STATUS:')) {
+        return { suppress: true, log: `Suppressed broadcast from ${msg.from}` };
+      }
+
+      // Highlight urgent messages
+      if (msg.content.includes('URGENT')) {
+        return {
+          inject: `\n>>> URGENT from ${msg.from}: ${msg.content} <<<\n`
+        };
+      }
+
+      // Default formatting (return nothing to use default)
+    }
+  }
+};
+```
+
+---
+
+### Example 4: Custom Pattern Handler
+
+Define `@ticket:` pattern to create tickets.
+
+```typescript
+// relay.config.ts
+export default {
+  patterns: {
+    ticket: {
+      handler: async (target, message, ctx) => {
+        // target = priority (high, medium, low)
+        // message = ticket description
+
+        // Just log and acknowledge - no external calls in default sandbox
+        return {
+          inject: `Ticket logged: [${target}] ${message.slice(0, 50)}...`,
+          log: `ticket:${target} - ${message}`
+        };
+      }
+    }
+  }
+};
+
+// Agent usage:
+// @ticket:high Fix authentication timeout in login flow
+// @ticket:low Update README with new CLI options
+```
+
+---
+
+### Example 5: Coordinator Agent Hooks
+
+Special hooks for a coordinating agent that manages others.
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onSessionStart: async (ctx) => {
+      if (ctx.agentName === 'Coordinator') {
+        return {
+          inject: `[COORDINATOR MODE]
+You are managing the following agents. Monitor their status.
+Use @relay:AgentName to communicate.
+Use @relay:* to broadcast.`,
+          sendMessage: {
+            to: '*',
+            content: 'Coordinator online. Report status.'
+          }
+        };
+      }
+    },
+
+    onMessageReceived: async (msg, ctx) => {
+      // Log all incoming messages for coordinator
+      if (ctx.agentName === 'Coordinator') {
+        return {
+          log: `[${msg.from}] ${msg.content.slice(0, 200)}`
+        };
+      }
+    }
+  }
+};
+```
+
+---
+
+### Example 6: Minimal Config (Just Memory Prompts)
+
+Simplest useful configuration.
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onSessionEnd: async () => ({
+      inject: 'Save anything important: @memory:save <what you learned>'
+    })
+  }
+};
+```
+
+---
+
+### Example 7: Debug Mode
+
+Log everything for troubleshooting.
+
+```typescript
+// relay.config.ts
+export default {
+  hooks: {
+    onSessionStart: async (ctx) => ({
+      log: `START: ${ctx.agentName} in ${ctx.workingDir}`
+    }),
+
+    onOutput: async (output, ctx) => ({
+      log: `OUTPUT[${ctx.agentName}]: ${output.slice(0, 100)}`
+    }),
+
+    onMessageReceived: async (msg, ctx) => ({
+      log: `MSG[${ctx.agentName}] from ${msg.from}: ${msg.content.slice(0, 100)}`
+    }),
+
+    onIdle: async (ctx) => ({
+      log: `IDLE: ${ctx.agentName} for ${ctx.idleSeconds}s`
+    }),
+
+    onSessionEnd: async (ctx) => ({
+      log: `END: ${ctx.agentName}`
+    })
+  }
+};
+```
+
+---
+
 ## Built-in Pattern Handlers
 
 ### @relay: (Messaging)

From a8e5bca6165d03dfec7115bbb13e927188c20acb Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Sun, 21 Dec 2025 16:36:01 +0000
Subject: [PATCH 15/19] Add example use cases for each lifecycle hook

onSessionStart (2 examples):
- Inject project context
- Role-based context by agent name

onOutput (3 examples):
- Error detection and alerting
- Progress tracking (test results)
- Security keyword alerting

onIdle (3 examples):
- Escalating idle prompts (30s gentle, 2min urgent)
- Auto-save reminder
- Silent coordinator notification

onMessageReceived (4 examples):
- Custom formatting with priority
- Suppress broadcasts while focused
- Filter by sender whitelist
- Transform task assignments

onSessionEnd (4 examples):
- Memory save prompt
- Notify team of departure with duration
- Request summary before exit
- Silent logging only
---
 docs/HOOKS_API.md | 179 ++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 179 insertions(+)

diff --git a/docs/HOOKS_API.md b/docs/HOOKS_API.md
index 09e3ccaaf..a0c3da106 100644
--- a/docs/HOOKS_API.md
+++ b/docs/HOOKS_API.md
@@ -150,6 +150,32 @@ async start(command: string) {
 onSessionStart: (ctx: HookContext) => Promise<HookResult | void>
 ```
 
+**Example - Inject project context:**
+```typescript
+onSessionStart: async (ctx) => {
+  // Greet agent with project info
+  return {
+    inject: `[CONTEXT] You are working in ${ctx.projectName}.
+Working directory: ${ctx.workingDir}
+Remember to save important learnings with @memory:save`,
+    log: `Session started: ${ctx.agentName} in ${ctx.projectName}`
+  };
+}
+```
+
+**Example - Role-based context:**
+```typescript
+onSessionStart: async (ctx) => {
+  const roles = {
+    'Reviewer': 'Focus on code quality, security, and best practices.',
+    'Architect': 'Design systems, make technical decisions.',
+    'Developer': 'Implement features, fix bugs, write tests.',
+  };
+  const role = roles[ctx.agentName] || 'General assistant.';
+  return { inject: `[ROLE] ${role}` };
+}
+```
+
 ---
 
 ### onOutput
@@ -190,6 +216,45 @@ onOutput: (output: string, ctx: HookContext) => Promise<HookResult | void>
 
 **Note:** This fires frequently. Keep handlers fast. Don't inject on every output.
 
+**Example - Error detection:**
+```typescript
+onOutput: async (output, ctx) => {
+  // Alert coordinator when errors occur
+  if (output.includes('Error:') || output.includes('FATAL')) {
+    return {
+      sendMessage: {
+        to: 'Coordinator',
+        content: `[ALERT] ${ctx.agentName} hit error: ${output.slice(0, 200)}`
+      },
+      log: `Error in ${ctx.agentName}: ${output.slice(0, 100)}`
+    };
+  }
+}
+```
+
+**Example - Progress tracking:**
+```typescript
+onOutput: async (output, ctx) => {
+  // Log test results
+  if (output.includes('PASS') || output.includes('FAIL')) {
+    return { log: `[TEST] ${ctx.agentName}: ${output.slice(0, 150)}` };
+  }
+}
+```
+
+**Example - Keyword alerting:**
+```typescript
+onOutput: async (output, ctx) => {
+  // Notify on security-related output
+  const keywords = ['vulnerability', 'CVE-', 'security', 'exploit'];
+  if (keywords.some(k => output.toLowerCase().includes(k))) {
+    return {
+      sendMessage: { to: 'Security', content: `Review needed: ${output.slice(0, 300)}` }
+    };
+  }
+}
+```
+
 ---
 
 ### onIdle
@@ -247,6 +312,43 @@ export default {
 };
 ```
 
+**Example - Escalating idle prompts:**
+```typescript
+onIdle: async (ctx) => {
+  // Gentle prompt after 30s, escalate after 2min
+  if (ctx.idleSeconds > 120) {
+    return {
+      inject: '[STUCK?] No activity for 2+ minutes. Need help?',
+      sendMessage: { to: 'Coordinator', content: `${ctx.agentName} idle for ${ctx.idleSeconds}s` }
+    };
+  } else if (ctx.idleSeconds > 30) {
+    return { inject: '[STATUS] Still working? Update with @relay:* STATUS: ...' };
+  }
+}
+```
+
+**Example - Auto-save reminder:**
+```typescript
+onIdle: async (ctx) => {
+  return {
+    inject: '[REMINDER] Consider saving progress: @memory:save <current status>'
+  };
+}
+```
+
+**Example - Notify coordinator only (no injection):**
+```typescript
+onIdle: async (ctx) => {
+  // Silent monitoring - don't interrupt agent
+  if (ctx.idleSeconds > 60) {
+    return {
+      sendMessage: { to: 'Coordinator', content: `${ctx.agentName} idle ${ctx.idleSeconds}s` },
+      log: `Idle alert: ${ctx.agentName}`
+    };
+  }
+}
+```
+
 ---
 
 ### onMessageReceived
@@ -294,6 +396,44 @@ onMessageReceived: async (msg, ctx) => {
 }
 ```
 
+**Example - Suppress broadcasts while focused:**
+```typescript
+onMessageReceived: async (msg, ctx) => {
+  // Suppress status broadcasts, keep direct messages
+  if (msg.from === '*' && msg.content.startsWith('STATUS:')) {
+    return { suppress: true, log: `Suppressed broadcast: ${msg.content.slice(0, 50)}` };
+  }
+}
+```
+
+**Example - Filter by sender:**
+```typescript
+onMessageReceived: async (msg, ctx) => {
+  // Only accept messages from Coordinator and Reviewer
+  const allowedSenders = ['Coordinator', 'Reviewer'];
+  if (!allowedSenders.includes(msg.from)) {
+    return { suppress: true, log: `Blocked message from ${msg.from}` };
+  }
+}
+```
+
+**Example - Transform task assignments:**
+```typescript
+onMessageReceived: async (msg, ctx) => {
+  // Reformat task assignments with clear structure
+  if (msg.content.startsWith('TASK:')) {
+    const task = msg.content.replace('TASK:', '').trim();
+    return {
+      inject: `
+───────────────────────────────
+NEW TASK from ${msg.from}:
+${task}
+───────────────────────────────`
+    };
+  }
+}
+```
+
 ---
 
 ### onSessionEnd
@@ -348,6 +488,45 @@ Before you go, save any important learnings:
 }
 ```
 
+**Example - Notify team of departure:**
+```typescript
+onSessionEnd: async (ctx) => {
+  const duration = Math.round((Date.now() - ctx.sessionStartTime) / 60000);
+  return {
+    inject: '[ENDING] Save your progress!',
+    sendMessage: {
+      to: '*',
+      content: `${ctx.agentName} signing off after ${duration} minutes`
+    },
+    log: `Session ended: ${ctx.agentName} (${duration}m)`
+  };
+}
+```
+
+**Example - Request summary before exit:**
+```typescript
+onSessionEnd: async (ctx) => {
+  return {
+    inject: `[SESSION END] Please provide a brief summary:
+1. What did you accomplish?
+2. What's left to do?
+3. Any blockers for the next agent?
+
+Reply, then I'll save your response.`
+  };
+}
+```
+
+**Example - Silent logging only:**
+```typescript
+onSessionEnd: async (ctx) => {
+  // No injection, just audit log
+  return {
+    log: `END: ${ctx.agentName} | project: ${ctx.projectName} | messages: ${ctx.recentMessages.length}`
+  };
+}
+```
+
 ---
 
 ### onToolCall (Future)

From d540dc15fe39b05200f193f7e04d72dc4773a602 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 24 Dec 2025 11:11:07 +0000
Subject: [PATCH 16/19] feat: Add detached mode for long-running agent sessions

Add -d/--detach flag to start agents in background, allowing SSH users
to disconnect without losing agent sessions. Includes attach/kill commands
for session management.
---
 package-lock.json           |   5 +-
 src/cli/index.ts            | 123 +++++++++++++++++++++++++++++++++++-
 src/wrapper/tmux-wrapper.ts |  11 ++++
 3 files changed, 135 insertions(+), 4 deletions(-)

diff --git a/package-lock.json b/package-lock.json
index 2542af10d..8e1896d35 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -1,12 +1,13 @@
 {
   "name": "agent-relay",
-  "version": "1.0.7",
+  "version": "1.0.8",
   "lockfileVersion": 3,
   "requires": true,
   "packages": {
     "": {
       "name": "agent-relay",
-      "version": "1.0.7",
+      "version": "1.0.8",
+      "hasInstallScript": true,
       "license": "MIT",
       "dependencies": {
         "better-sqlite3": "^9.4.3",
diff --git a/src/cli/index.ts b/src/cli/index.ts
index f3049fcf9..3d7d1f7e0 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -49,7 +49,9 @@ program
 program
   .option('-n, --name <name>', 'Agent name (auto-generated if not set)')
   .option('-q, --quiet', 'Disable debug output', false)
+  .option('-d, --detach', 'Start agent in background (detached mode)')
   .option('--prefix <pattern>', 'Relay prefix pattern (default: ->relay:)')
+  .option('--_daemon', 'Internal flag for daemon mode (do not use directly)')
   .argument('[command...]', 'Command to wrap (e.g., claude)')
   .action(async (commandParts, options) => {
     // If no command provided, show help
@@ -58,17 +60,53 @@ program
       return;
     }
 
-    const { getProjectPaths } = await import('../utils/project-namespace.js');
-    const paths = getProjectPaths();
+    const { getProjectPaths, ensureProjectDir } = await import('../utils/project-namespace.js');
+    const paths = ensureProjectDir();
 
     const [mainCommand, ...commandArgs] = commandParts;
     const agentName = options.name ?? generateAgentName();
 
+    // Handle detached mode - spawn daemon and exit
+    if (options.detach && !options._daemon) {
+      const { spawn } = await import('node:child_process');
+
+      // Build args for the daemon process
+      const daemonArgs = [
+        ...process.argv.slice(2).filter(a => a !== '-d' && a !== '--detach'),
+        '--_daemon',
+        '-n', agentName, // Ensure same name is used
+      ];
+
+      // Spawn detached process
+      const child = spawn(process.argv[0], [process.argv[1], ...daemonArgs], {
+        detached: true,
+        stdio: 'ignore',
+        cwd: process.cwd(),
+        env: process.env,
+      });
+
+      // Write PID file for the wrapper
+      const wrapperPidPath = path.join(paths.dataDir, `wrapper-${agentName}.pid`);
+      fs.writeFileSync(wrapperPidPath, String(child.pid));
+
+      child.unref();
+
+      console.log(`Agent: ${agentName}`);
+      console.log(`Project: ${paths.projectId}`);
+      console.log(`Started in detached mode (PID: ${child.pid})`);
+      console.log(`Attach with: agent-relay attach ${agentName}`);
+      console.log(`Stop with:   agent-relay kill ${agentName}`);
+      return;
+    }
+
     console.error(`Agent: ${agentName}`);
     console.error(`Project: ${paths.projectId}`);
 
     const { TmuxWrapper } = await import('../wrapper/tmux-wrapper.js');
 
+    // Daemon mode: run wrapper without attaching to tmux
+    const isDaemon = Boolean(options._daemon);
+
     const wrapper = new TmuxWrapper({
       name: agentName,
       command: mainCommand,
@@ -78,6 +116,7 @@ program
       relayPrefix: options.prefix,
       useInbox: true,
       inboxDir: paths.dataDir, // Use the project-specific data directory for the inbox
+      detached: isDaemon, // Don't attach if running as daemon
     });
 
     process.on('SIGINT', () => {
@@ -85,6 +124,11 @@ program
       process.exit(0);
     });
 
+    process.on('SIGTERM', () => {
+      wrapper.stop();
+      process.exit(0);
+    });
+
     await wrapper.start();
   });
 
@@ -200,6 +244,81 @@ program
     }
   });
 
+// attach - Attach to a running agent's tmux session
+program
+  .command('attach')
+  .description('Attach to a running agent session')
+  .argument('<name>', 'Agent name to attach to')
+  .action(async (name) => {
+    const sessionName = `relay-${name}`;
+
+    // Check if session exists
+    try {
+      await execAsync(`tmux has-session -t ${sessionName} 2>/dev/null`);
+    } catch {
+      console.error(`No session found for agent: ${name}`);
+      console.error(`Run 'agent-relay status' to see available sessions.`);
+      process.exit(1);
+    }
+
+    // Attach to the session
+    const { spawn } = await import('node:child_process');
+    const attach = spawn('tmux', ['attach-session', '-t', sessionName], {
+      stdio: 'inherit',
+    });
+
+    attach.on('exit', (code) => {
+      process.exit(code ?? 0);
+    });
+
+    attach.on('error', (err) => {
+      console.error(`Failed to attach: ${err.message}`);
+      process.exit(1);
+    });
+  });
+
+// kill - Stop a detached agent
+program
+  .command('kill')
+  .description('Stop a detached agent and its tmux session')
+  .argument('<name>', 'Agent name to kill')
+  .option('--force', 'Force kill without confirmation')
+  .action(async (name, options) => {
+    const { getProjectPaths } = await import('../utils/project-namespace.js');
+    const paths = getProjectPaths();
+
+    const sessionName = `relay-${name}`;
+    const wrapperPidPath = path.join(paths.dataDir, `wrapper-${name}.pid`);
+
+    let killed = false;
+
+    // Kill the wrapper process if PID file exists
+    if (fs.existsSync(wrapperPidPath)) {
+      const pid = Number(fs.readFileSync(wrapperPidPath, 'utf-8').trim());
+      try {
+        process.kill(pid, 'SIGTERM');
+        console.log(`Stopped wrapper process (PID: ${pid})`);
+        killed = true;
+      } catch {
+        // Process may already be dead
+      }
+      fs.unlinkSync(wrapperPidPath);
+    }
+
+    // Kill the tmux session
+    try {
+      await execAsync(`tmux kill-session -t ${sessionName}`);
+      console.log(`Killed tmux session: ${sessionName}`);
+      killed = true;
+    } catch {
+      // Session may not exist
+    }
+
+    if (!killed) {
+      console.log(`No running agent found: ${name}`);
+    }
+  });
+
 // agents - List connected agents (from registry file)
 program
   .command('agents')
diff --git a/src/wrapper/tmux-wrapper.ts b/src/wrapper/tmux-wrapper.ts
index a65897ba0..83d76f6c7 100644
--- a/src/wrapper/tmux-wrapper.ts
+++ b/src/wrapper/tmux-wrapper.ts
@@ -75,6 +75,8 @@ export interface TmuxWrapperConfig {
   mouseMode?: boolean;
   /** Relay prefix pattern (default: '->relay:') */
   relayPrefix?: string;
+  /** Run in detached mode (don't attach to tmux, run as background daemon) */
+  detached?: boolean;
 }
 
 /**
@@ -370,6 +372,15 @@ export class TmuxWrapper {
     // Start background polling (silent - no stdout writes)
     this.startSilentPolling();
 
+    // In detached mode, don't attach - just keep polling in background
+    if (this.config.detached) {
+      this.logStderr('Running in detached mode (daemon)', true);
+      this.logStderr(`Attach with: tmux attach -t ${this.sessionName}`, true);
+      // Keep the process alive
+      await new Promise(() => {}); // Never resolves - keeps daemon running
+      return;
+    }
+
     // Attach user to tmux session
     // This takes over stdin/stdout - user sees the real terminal
     this.attachToSession();

From 5320172d6eb8d0da49be81353af4cfb8f61a00d4 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 24 Dec 2025 11:14:06 +0000
Subject: [PATCH 17/19] test: Add tests for attach, kill commands and detach
 flag

---
 src/cli/index.test.ts | 36 ++++++++++++++++++++++++++++++++++++
 1 file changed, 36 insertions(+)

diff --git a/src/cli/index.test.ts b/src/cli/index.test.ts
index e81bc1692..4be6881b5 100644
--- a/src/cli/index.test.ts
+++ b/src/cli/index.test.ts
@@ -114,6 +114,42 @@ describe('CLI', () => {
     });
   });
 
+  describe('attach', () => {
+    it('should show help for attach command', async () => {
+      const { stdout } = await runCli('attach --help');
+      expect(stdout).toContain('Attach to a running agent session');
+      expect(stdout).toContain('<name>');
+    });
+
+    it('should error when session does not exist', async () => {
+      const { stderr, code } = await runCli('attach nonexistent-agent');
+      expect(code).not.toBe(0);
+      expect(stderr).toContain('No session found');
+    });
+  });
+
+  describe('kill', () => {
+    it('should show help for kill command', async () => {
+      const { stdout } = await runCli('kill --help');
+      expect(stdout).toContain('Stop a detached agent');
+      expect(stdout).toContain('<name>');
+      expect(stdout).toContain('--force');
+    });
+
+    it('should handle killing nonexistent agent gracefully', async () => {
+      const { stdout } = await runCli('kill nonexistent-agent-12345');
+      expect(stdout).toContain('No running agent found');
+    });
+  });
+
+  describe('detach flag', () => {
+    it('should show -d/--detach in help', async () => {
+      const { stdout } = await runCli('--help');
+      expect(stdout).toContain('-d, --detach');
+      expect(stdout).toContain('background');
+    });
+  });
+
   describe('history', () => {
     it('should show history or empty message', async () => {
       const { stdout, code } = await runCli('history --limit 5');

From 5b7d60e97dfc78bd2d7e2fecc882e1885f5c1221 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 24 Dec 2025 11:15:37 +0000
Subject: [PATCH 18/19] fix(test): Check for .project marker instead of dataDir
 in listProjects test

The test was checking if dataDir exists, but listProjects() requires
the .project marker file to be present.
---
 src/utils/project-namespace.test.ts | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/src/utils/project-namespace.test.ts b/src/utils/project-namespace.test.ts
index 34803481f..d1851f7e1 100644
--- a/src/utils/project-namespace.test.ts
+++ b/src/utils/project-namespace.test.ts
@@ -146,8 +146,9 @@ describe('project-namespace', () => {
       // Get current project paths
       const currentPaths = getProjectPaths();
 
-      // Only test if data dir exists (project has been initialized)
-      if (fs.existsSync(currentPaths.dataDir)) {
+      // Only test if .project marker exists (listProjects requires the marker, not just the dir)
+      const markerPath = path.join(currentPaths.dataDir, '.project');
+      if (fs.existsSync(markerPath)) {
         const projects = listProjects();
         const found = projects.find(p => p.projectId === currentPaths.projectId);
         expect(found).toBeTruthy();

From ff51bdbd2de3bb8ab5f106a199fb1b13d238de36 Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 24 Dec 2025 11:25:49 +0000
Subject: [PATCH 19/19] chore: Hide internal --_daemon flag from CLI help

---
 src/cli/index.ts | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/src/cli/index.ts b/src/cli/index.ts
index 3d7d1f7e0..f85855cba 100644
--- a/src/cli/index.ts
+++ b/src/cli/index.ts
@@ -51,7 +51,6 @@ program
   .option('-q, --quiet', 'Disable debug output', false)
   .option('-d, --detach', 'Start agent in background (detached mode)')
   .option('--prefix <pattern>', 'Relay prefix pattern (default: ->relay:)')
-  .option('--_daemon', 'Internal flag for daemon mode (do not use directly)')
   .argument('[command...]', 'Command to wrap (e.g., claude)')
   .action(async (commandParts, options) => {
     // If no command provided, show help
@@ -67,7 +66,8 @@ program
     const agentName = options.name ?? generateAgentName();
 
     // Handle detached mode - spawn daemon and exit
-    if (options.detach && !options._daemon) {
+    const isDaemonProcess = process.argv.includes('--_daemon');
+    if (options.detach && !isDaemonProcess) {
       const { spawn } = await import('node:child_process');
 
       // Build args for the daemon process
@@ -104,9 +104,6 @@ program
 
     const { TmuxWrapper } = await import('../wrapper/tmux-wrapper.js');
 
-    // Daemon mode: run wrapper without attaching to tmux
-    const isDaemon = Boolean(options._daemon);
-
     const wrapper = new TmuxWrapper({
       name: agentName,
       command: mainCommand,
@@ -116,7 +113,7 @@ program
       relayPrefix: options.prefix,
       useInbox: true,
       inboxDir: paths.dataDir, // Use the project-specific data directory for the inbox
-      detached: isDaemon, // Don't attach if running as daemon
+      detached: isDaemonProcess, // Don't attach if running as daemon
     });
 
     process.on('SIGINT', () => {