diff --git a/docs/hooks/tools.mdx b/docs/hooks/tools.mdx index b92cfb76d3..6566917158 100644 --- a/docs/hooks/tools.mdx +++ b/docs/hooks/tools.mdx @@ -552,16 +552,6 @@ If a value is too large for the environment, it may be omitted (not set). Mux al -
-propose_name (2) - -| Env var | JSON path | Type | Description | -| ---------------------- | --------- | ------ | -------------------------------------------------------------------------------------------------- | -| `MUX_TOOL_INPUT_NAME` | `name` | string | Codebase area (1-2 words, max 15 chars): lowercase, hyphens only, e.g. 'sidebar', 'auth', 'config' | -| `MUX_TOOL_INPUT_TITLE` | `title` | string | Human-readable title (2-5 words): verb-noun format like 'Fix plan mode' | - -
-
skills_catalog_read (3) diff --git a/scripts/gen_docs.ts b/scripts/gen_docs.ts index 6aa6bdb54c..a8ca5e4826 100644 --- a/scripts/gen_docs.ts +++ b/scripts/gen_docs.ts @@ -675,6 +675,9 @@ function generateToolHookEnvVarsBlock(): string { const tools = Object.entries(TOOL_DEFINITIONS).sort(([a], [b]) => a.localeCompare(b)); for (const [toolName, def] of tools) { + // Skip internal/bespoke tools (e.g. propose_name, propose_status) β€” users + // can't write hooks for them, so listing their env vars is misleading. + if ((def as { internal?: boolean }).internal) continue; const vars = collectToolHookEnvVarsFromZodSchema(def.schema); if (vars.length === 0) continue; diff --git a/src/browser/components/icons/EmojiIcon/EmojiIcon.tsx b/src/browser/components/icons/EmojiIcon/EmojiIcon.tsx index b9da8229ea..7acbee1ffa 100644 --- a/src/browser/components/icons/EmojiIcon/EmojiIcon.tsx +++ b/src/browser/components/icons/EmojiIcon/EmojiIcon.tsx @@ -62,6 +62,10 @@ const EMOJI_TO_ICON: Record = { // Tool-ish / app-ish "πŸ”§": Wrench, + // πŸ›  (hammer-and-wrench) is what small models pick most often for + // generic "fixing / building" sidebar status, so we map it to Wrench + // alongside πŸ”§ to avoid the Sparkles fallback. + "πŸ› ": Wrench, "πŸ””": Bell, "🌐": Globe, "πŸ“–": BookOpen, diff --git a/src/browser/stores/WorkspaceStore.test.ts b/src/browser/stores/WorkspaceStore.test.ts index af4febbf15..ef98bee413 100644 --- a/src/browser/stores/WorkspaceStore.test.ts +++ b/src/browser/stores/WorkspaceStore.test.ts @@ -1,4 +1,4 @@ -import { describe, expect, it, beforeEach, afterEach, mock, type Mock } from "bun:test"; +import { describe, expect, it, beforeEach, afterEach, mock, spyOn, type Mock } from "bun:test"; import type { DisplayedMessage } from "@/common/types/message"; import type { FrontendWorkspaceMetadata } from "@/common/types/workspace"; import type { StreamStartEvent, ToolCallStartEvent } from "@/common/types/stream"; @@ -2591,6 +2591,60 @@ describe("WorkspaceStore", () => { expect(state.agentStatus).toEqual({ emoji: "πŸ”„", message: "Run typecheck" }); }); + it("live todo derivation wins over aggregator getAgentStatus (status_set/heartbeat) for active workspaces", () => { + // Codex round 6: aggregator.getAgentStatus() conflates status_set and + // muxMeta.displayStatus into one field. A status_set value persisted + // from a previous turn could mask a fresh todo_write in the current + // turn. Live todo must win. + const workspaceId = "active-live-todo-beats-aggregator-status"; + createAndAddWorkspace(store, workspaceId); + seedPinnedTodos(store, workspaceId, [{ content: "Run typecheck", status: "in_progress" }]); + + // Simulate an aggregator that has a non-empty getAgentStatus() + // (e.g. an old status_set from a previous turn). The new precedence + // must ignore it because the live todo derivation is fresher. + const aggregator = store.getAggregator(workspaceId); + if (!aggregator) throw new Error("expected aggregator"); + spyOn(aggregator, "getAgentStatus").mockReturnValue({ + emoji: "πŸ”", + message: "Investigating crash", + }); + + const state = store.getWorkspaceState(workspaceId); + expect(state.agentStatus).toEqual({ emoji: "πŸ”„", message: "Run typecheck" }); + }); + + it("falls back to persisted AI status for active workspaces with no live todos", async () => { + // Live aggregator todos are the freshest signal for "what is the + // agent doing right now" because `todo_write` is processed + // synchronously, before the async setTodoStatus + activity-emit round + // trip. So when the workspace has live todos we prefer those (see + // the existing "derives active workspace status from the current todo + // list" test). When there are NO live todos, the AI-generated + // todoStatus from AgentStatusService still has to surface β€” that's + // the common "free-form chat without a todo list" case. + const workspaceId = "active-ai-no-live-todos"; + const activitySnapshot: WorkspaceActivitySnapshot = { + recency: new Date("2024-01-04T13:00:00.000Z").getTime(), + streaming: true, + lastModel: "claude-sonnet-4", + lastThinkingLevel: null, + todoStatus: { emoji: "πŸ› οΈ", message: "AI-generated summary" }, + }; + + mockActivityList.mockResolvedValue({ [workspaceId]: activitySnapshot }); + recreateStore(); + await tick(0); + + createAndAddWorkspace(store, workspaceId); + // Intentionally no seedPinnedTodos β€” the aggregator has no todos, so + // the live derivation returns undefined and the persisted AI status + // must surface through the fallback chain. + + const state = store.getWorkspaceState(workspaceId); + expect(state.agentStatus).toEqual(activitySnapshot.todoStatus ?? undefined); + }); + it("prefers todo-derived activity status for inactive workspaces", async () => { const workspaceId = "activity-fallback-todo-status-workspace"; const activitySnapshot: WorkspaceActivitySnapshot = { @@ -2634,6 +2688,57 @@ describe("WorkspaceStore", () => { expect(state.agentStatus).toEqual(activitySnapshot.displayStatus ?? undefined); }); + it("uses todoStatus from the activity snapshot for inactive workspaces", async () => { + // todoStatus is the persistent sidebar slot β€” written by both the + // small-model AgentStatusService and the todo-derivation path. Inactive + // workspaces don't run the aggregator, so the snapshot's todoStatus is + // what the sidebar must show. + const workspaceId = "activity-fallback-todo-status-workspace"; + const activitySnapshot: WorkspaceActivitySnapshot = { + recency: new Date("2024-01-04T16:00:00.000Z").getTime(), + streaming: false, + lastModel: "claude-sonnet-4", + lastThinkingLevel: null, + todoStatus: { emoji: "πŸ› οΈ", message: "Wiring sidebar precedence" }, + hasTodos: true, + }; + + mockActivityList.mockResolvedValue({ [workspaceId]: activitySnapshot }); + recreateStore(); + await tick(0); + + createAndAddWorkspace(store, workspaceId, { createdAt: "2020-01-01T00:00:00.000Z" }, false); + + const state = store.getWorkspaceState(workspaceId); + expect(state.agentStatus).toEqual(activitySnapshot.todoStatus ?? undefined); + }); + + it("keeps displayStatus precedence over todoStatus so explicit system status still wins", async () => { + // displayStatus is a deliberate, system-driven signal (e.g. "Compacting + // idle workspace…"). It must outrank todoStatus β€” otherwise a periodic + // small-model rewrite of todoStatus would mask the explicit progress + // message the backend is trying to communicate. + const workspaceId = "activity-fallback-display-over-todo"; + const activitySnapshot: WorkspaceActivitySnapshot = { + recency: new Date("2024-01-04T17:00:00.000Z").getTime(), + streaming: false, + lastModel: "claude-sonnet-4", + lastThinkingLevel: null, + displayStatus: { emoji: "πŸ’€", message: "Compacting idle workspace" }, + todoStatus: { emoji: "πŸ› οΈ", message: "Wiring sidebar precedence" }, + hasTodos: false, + }; + + mockActivityList.mockResolvedValue({ [workspaceId]: activitySnapshot }); + recreateStore(); + await tick(0); + + createAndAddWorkspace(store, workspaceId, { createdAt: "2020-01-01T00:00:00.000Z" }, false); + + const state = store.getWorkspaceState(workspaceId); + expect(state.agentStatus).toEqual(activitySnapshot.displayStatus ?? undefined); + }); + it("suppresses stale legacy status fallback when activity says the todo list is empty", async () => { const workspaceId = "activity-fallback-empty-todo-status"; const activitySnapshot: WorkspaceActivitySnapshot = { diff --git a/src/browser/stores/WorkspaceStore.ts b/src/browser/stores/WorkspaceStore.ts index 6822897317..7d4186f1ae 100644 --- a/src/browser/stores/WorkspaceStore.ts +++ b/src/browser/stores/WorkspaceStore.ts @@ -1745,13 +1745,40 @@ export class WorkspaceStore { !transient.caughtUp && !hasRunningInitMessage; const aggregatorTodos = aggregator.getCurrentTodos(); + // Sidebar status precedence, split into four tiers so each signal + // wins exactly when it should. Active and inactive workspaces draw + // from different sources but resolve through the same priority. + // + // 1. displayStatus (inactive only): system-driven transient status + // from disk, e.g. "Compacting idle workspace…". Always wins. + // 2. liveTodoStatus (active only): the agent's most recent + // `todo_write`, processed synchronously by the aggregator. + // Beats the aggregator's persisted status_set value because + // todo_write is the freshest explicit signal; beats persisted + // todoStatus because the live aggregator state is ahead of + // the async setTodoStatus + activity-emit round-trip. + // 3. fallbackAgentStatus (active only): aggregator.getAgentStatus() + // β€” a blend of heartbeat / idle-compaction / background-turn + // `displayStatus` events (genuinely transient) and the agent's + // own `status_set` tool result (a pinned high-level intent). + // Wins over persisted todoStatus so an AI-generated summary + // doesn't mask an explicit system or agent-set message. + // 4. persistedTodoStatus: activity.todoStatus from disk. Either + // a stale todo derivation or an AgentStatusService AI summary β€” + // both writers target the same slot, last write wins. The + // lowest tier so a newer in-memory signal always preempts. + // For inactive workspaces, `hasTodos === false` blocks the + // legacy aggregator-derive fallback so a freshly cleared todo + // list doesn't briefly resurrect the stale derivation. const displayStatus = useAggregatorState ? undefined : (activity?.displayStatus ?? undefined); - const todoStatus = useAggregatorState - ? (deriveTodoStatus(aggregatorTodos) ?? activity?.todoStatus ?? undefined) + const liveTodoStatus = useAggregatorState ? deriveTodoStatus(aggregatorTodos) : undefined; + const fallbackAgentStatus = useAggregatorState ? aggregator.getAgentStatus() : undefined; + const persistedTodoStatus = useAggregatorState + ? (activity?.todoStatus ?? undefined) : (activity?.todoStatus ?? (activity?.hasTodos === false ? undefined : deriveTodoStatus(aggregatorTodos))); - const fallbackAgentStatus = useAggregatorState ? aggregator.getAgentStatus() : undefined; - const agentStatus = displayStatus ?? todoStatus ?? fallbackAgentStatus; + const agentStatus = + displayStatus ?? liveTodoStatus ?? fallbackAgentStatus ?? persistedTodoStatus; return { name: metadata?.name ?? workspaceId, // Fall back to ID if metadata missing diff --git a/src/common/orpc/schemas/workspace.ts b/src/common/orpc/schemas/workspace.ts index 7f1a4326b9..3e19d8fcd4 100644 --- a/src/common/orpc/schemas/workspace.ts +++ b/src/common/orpc/schemas/workspace.ts @@ -209,7 +209,7 @@ export const WorkspaceActivitySnapshotSchema = z.object({ }), todoStatus: WorkspaceAgentStatusSchema.nullable().optional().meta({ description: - "Status derived from the current todo list (preferred background progress surface in the sidebar).", + "Persistent sidebar status. Set by the small-model AgentStatusService when available, with a todo-derived fallback.", }), hasTodos: z.boolean().optional().meta({ description: "Whether the workspace still had todos when streaming last stopped", diff --git a/src/common/utils/tools/toolDefinitions.ts b/src/common/utils/tools/toolDefinitions.ts index bd5439e5a2..4c869c056b 100644 --- a/src/common/utils/tools/toolDefinitions.ts +++ b/src/common/utils/tools/toolDefinitions.ts @@ -831,6 +831,27 @@ export const ProposeNameToolArgsSchema = z.object({ .describe("Human-readable title (2-5 words): verb-noun format like 'Fix plan mode'"), }); +// ----------------------------------------------------------------------------- +// propose_status (sidebar agent status generation) +// ----------------------------------------------------------------------------- + +export const ProposeStatusToolArgsSchema = z.object({ + emoji: z + .string() + .min(1) + .max(8) + .describe( + "A single emoji that represents the agent's current activity (e.g. 'πŸ”', 'πŸ› οΈ', 'πŸ§ͺ', 'πŸ“')" + ), + message: z + .string() + .min(2) + .max(60) + .describe( + "A short verb-led phrase (2-6 words) describing what the agent is currently working on, in sentence case, no punctuation, no quotes (e.g. 'Investigating crash', 'Implementing sidebar status')" + ), +}); + const MuxConfigFileSchema = z.enum(["providers", "config"]); /** @@ -1320,11 +1341,23 @@ export const TOOL_DEFINITIONS = { "Each question must include 2–4 options; an 'Other' choice is provided automatically.", schema: AskUserQuestionToolArgsSchema, }, + // `internal` tools are excluded from user-facing tool docs (hooks/tools.mdx + // env-var tables) because users can't write hooks for them β€” they run via + // bespoke streamText paths in their own services, not the standard tool + // execution pipeline. See gen_docs.ts. propose_name: { description: "Propose a workspace name and title. You MUST call this tool exactly once with your chosen name and title. " + "Do not emit a text response; call this tool immediately.", schema: ProposeNameToolArgsSchema, + internal: true, + }, + propose_status: { + description: + "Propose a short sidebar status (emoji + 2-6 word verb-led phrase) summarizing what the agent is currently doing. " + + "You MUST call this tool exactly once. Do not emit a text response; call this tool immediately.", + schema: ProposeStatusToolArgsSchema, + internal: true, }, propose_plan: { description: diff --git a/src/common/utils/tools/tools.ts b/src/common/utils/tools/tools.ts index e1870d3bde..63cb5a5020 100644 --- a/src/common/utils/tools/tools.ts +++ b/src/common/utils/tools/tools.ts @@ -431,10 +431,12 @@ export async function getToolsForModel( ...(config.advisorRuntime ? { advisor: createAdvisorTool(config) } : {}), ask_user_question: createAskUserQuestionTool(config), propose_plan: createProposePlanTool(config), - // propose_name is intentionally NOT registered here β€” it's only used by - // the internal workspace-naming path (workspaceTitleGenerator.ts) which - // creates the tool inline. Exposing it in the default toolset would let - // exec-derived agents see its "call me immediately" description. + // propose_name and propose_status are intentionally NOT registered here β€” + // they are only used by the internal workspace-naming path + // (workspaceTitleGenerator.ts) and the sidebar agent-status path + // (workspaceStatusGenerator.ts), which create the tool inline. Exposing + // them in the default toolset would let exec-derived agents see their + // "call me immediately" descriptions. ...(config.enableAgentReport ? { agent_report: createAgentReportTool(config) } : {}), switch_agent: createSwitchAgentTool(config), todo_write: createTodoWriteTool(config), diff --git a/src/constants/agentStatus.ts b/src/constants/agentStatus.ts new file mode 100644 index 0000000000..125658a004 --- /dev/null +++ b/src/constants/agentStatus.ts @@ -0,0 +1,47 @@ +/** + * Constants controlling the AI-generated sidebar agent status. + * + * The status is produced by the same "small model" path used for workspace + * title generation. We feed only a trailing window of chat (capped by both + * message count and token budget) and skip regeneration whenever the input + * is byte-for-byte unchanged. + */ + +/** + * Per-workspace regen intervals split four ways: streaming workspaces + * (active) refresh much faster so the user can follow the agent in real + * time; idle workspaces (no active stream) back off because the chat + * isn't moving anyway. Either case backs off further when the desktop + * window is blurred. + */ +export const AGENT_STATUS_ACTIVE_FOCUSED_INTERVAL_MS = 10 * 1000; +export const AGENT_STATUS_ACTIVE_UNFOCUSED_INTERVAL_MS = 30 * 1000; +export const AGENT_STATUS_IDLE_FOCUSED_INTERVAL_MS = 30 * 1000; +export const AGENT_STATUS_IDLE_UNFOCUSED_INTERVAL_MS = 2 * 60 * 1000; + +/** + * How often the scheduler wakes up to scan workspaces. Per-workspace cadence + * is enforced separately, so this can be small enough to make focus + * transitions feel snappy without driving redundant work. With + * AGENT_STATUS_MAX_CONCURRENT=1 the per-tick dispatch naturally smooths load + * across many workspaces β€” no separate startup delay needed. + */ +export const AGENT_STATUS_TICK_INTERVAL_MS = 10 * 1000; + +/** Token budget for the trailing chat-transcript window we feed the model. */ +export const AGENT_STATUS_MAX_TRANSCRIPT_TOKENS = 8000; + +/** Cap on the number of trailing messages we pull off disk before token trimming. */ +export const AGENT_STATUS_MAX_TRAILING_MESSAGES = 80; + +/** + * Cap on per-message text length before tokenization. Bounds pathological + * single messages (huge tool outputs) that would otherwise burn the budget. + */ +export const AGENT_STATUS_MAX_MESSAGE_CHARS = 4000; + +/** + * Maximum concurrent model invocations across all workspaces. Keep small so + * a multi-workspace sweep doesn't spike provider bills or rate limits. + */ +export const AGENT_STATUS_MAX_CONCURRENT = 1; diff --git a/src/node/services/ExtensionMetadataService.ts b/src/node/services/ExtensionMetadataService.ts index 4cd28091b3..cd1272e8e1 100644 --- a/src/node/services/ExtensionMetadataService.ts +++ b/src/node/services/ExtensionMetadataService.ts @@ -113,8 +113,12 @@ export class ExtensionMetadataService { } /** - * Initialize the service by ensuring directory exists and clearing stale streaming flags. - * Call this once on app startup. + * Initialize the service by ensuring directory exists and clearing stale + * streaming flags. Call once on app startup. + * + * Per AGENTS.md ("Startup-time initialization must never crash the app") + * disk failures here are logged and swallowed; save() itself throws so + * strict callers (e.g. AgentStatusService) can react. */ async initialize(): Promise { // Ensure directory exists @@ -122,11 +126,20 @@ export class ExtensionMetadataService { try { await access(dir, constants.F_OK); } catch { - await mkdir(dir, { recursive: true }); + try { + await mkdir(dir, { recursive: true }); + } catch (error) { + log.error("ExtensionMetadataService: failed to create metadata dir at startup", { error }); + return; + } } // Clear stale streaming flags (from crashes) - await this.clearStaleStreaming(); + try { + await this.clearStaleStreaming(); + } catch (error) { + log.error("ExtensionMetadataService: failed to clear stale streaming at startup", { error }); + } } private async load(): Promise { @@ -154,11 +167,16 @@ export class ExtensionMetadataService { } private async save(data: ExtensionMetadataFile): Promise { + // Throws on failure so callers that need to know whether the write + // actually happened (e.g. AgentStatusService dedup) can react. + // emitWorkspaceActivityUpdate (the historical wrapper used elsewhere) + // downgrades throws to logged warnings for log-and-continue paths. try { const content = JSON.stringify(data, null, 2); await writeFileAtomic(this.filePath, content, "utf-8"); } catch (error) { log.error("Failed to save metadata:", error); + throw error; } } @@ -226,6 +244,55 @@ export class ExtensionMetadataService { }); } + /** + * AgentStatusService writes its AI-generated payload into the same + * `todoStatus` field used by the todo-derived path. Passing `null` clears + * the slot. + * + * Unlike `setTodoStatus`, this writer: + * - Never advances `recency`. Background regeneration must not promote + * idle workspaces in the sidebar or mark them unread. Existing entries + * keep their user-interaction recency; brand-new entries (rare: chat + * exists but no metadata yet) are seeded with `recency=0` until the + * next real user interaction. + * - Doesn't touch `hasTodos`. The todo-derivation path owns that flag. + */ + async setSidebarStatus( + workspaceId: string, + status: ExtensionAgentStatus | null, + options: { skipIfRecencyAdvancedSince?: number | null } = {} + ): Promise { + return this.withSerializedMutation(async () => { + const data = await this.load(); + const existing = coerceExtensionMetadata(data.workspaces[workspaceId]); + const workspace: ExtensionMetadata = existing ?? { + recency: 0, + streaming: false, + lastModel: null, + lastThinkingLevel: null, + agentStatus: null, + displayStatus: null, + lastStatusUrl: null, + }; + if ( + options.skipIfRecencyAdvancedSince !== undefined && + existing && + (options.skipIfRecencyAdvancedSince === null || + existing.recency > options.skipIfRecencyAdvancedSince) + ) { + return null; + } + if (status) { + workspace.todoStatus = status; + } else { + delete workspace.todoStatus; + } + data.workspaces[workspaceId] = workspace; + await this.save(data); + return toWorkspaceActivitySnapshot(workspace); + }); + } + /** * Update the latest transient non-todo status payload for a workspace. */ diff --git a/src/node/services/agentSkills/builtInSkillContent.generated.ts b/src/node/services/agentSkills/builtInSkillContent.generated.ts index 166b37f5d5..5112cb95a6 100644 --- a/src/node/services/agentSkills/builtInSkillContent.generated.ts +++ b/src/node/services/agentSkills/builtInSkillContent.generated.ts @@ -4200,16 +4200,6 @@ export const BUILTIN_SKILL_FILES: Record> = { "
", "", "
", - "propose_name (2)", - "", - "| Env var | JSON path | Type | Description |", - "| ---------------------- | --------- | ------ | -------------------------------------------------------------------------------------------------- |", - "| `MUX_TOOL_INPUT_NAME` | `name` | string | Codebase area (1-2 words, max 15 chars): lowercase, hyphens only, e.g. 'sidebar', 'auth', 'config' |", - "| `MUX_TOOL_INPUT_TITLE` | `title` | string | Human-readable title (2-5 words): verb-noun format like 'Fix plan mode' |", - "", - "
", - "", - "
", "skills_catalog_read (3)", "", "| Env var | JSON path | Type | Description |", diff --git a/src/node/services/agentStatusService.test.ts b/src/node/services/agentStatusService.test.ts new file mode 100644 index 0000000000..133ebb7b9e --- /dev/null +++ b/src/node/services/agentStatusService.test.ts @@ -0,0 +1,1048 @@ +import { describe, test, expect, beforeEach, afterEach, mock, spyOn } from "bun:test"; +import { mkdtempSync, rmSync } from "fs"; +import { tmpdir } from "os"; +import { join } from "path"; +import type { ProjectsConfig, ProjectConfig, Workspace } from "@/common/types/project"; +import { Ok, Err } from "@/common/types/result"; +import { createMuxMessage } from "@/common/types/message"; +import type { Config } from "@/node/config"; +import type { AIService } from "./aiService"; +import { ExtensionMetadataService } from "./ExtensionMetadataService"; +import type { WindowService } from "./windowService"; +import type { WorkspaceService } from "./workspaceService"; +import type { TokenizerService } from "./tokenizerService"; +import { AgentStatusService } from "./agentStatusService"; +import * as workspaceStatusGenerator from "./workspaceStatusGenerator"; +import { createTestHistoryService } from "./testHistoryService"; + +interface AgentStatusServiceInternals { + runTick(): Promise; + runForWorkspace(workspaceId: string, observedRecency?: number | null): Promise; +} + +interface ActivitySnapshotForTest { + streaming: boolean; + recency?: number; +} + +describe("AgentStatusService", () => { + const workspaceId = "ws-test"; + const projectPath = "/test/project"; + + let historyHandle: Awaited>; + let projectsConfig: ProjectsConfig; + let mockConfig: Config; + let mockExtensionMetadata: ExtensionMetadataService; + let mockWorkspaceService: WorkspaceService; + let mockTokenizer: TokenizerService; + let mockAiService: AIService; + let windowService: WindowService; + let isFocused = true; + let setSidebarStatusMock: ReturnType< + typeof mock< + ( + workspaceId: string, + status: unknown, + options?: { skipIfRecencyAdvancedSince?: number | null } + ) => Promise<{ recency: number } | null> + > + >; + let getAllSnapshotsMock: ReturnType< + typeof mock<() => Promise>> + >; + let getSnapshotMock: ReturnType< + typeof mock<(workspaceId: string) => Promise<{ recency: number } | null>> + >; + let emitWorkspaceActivityMock: ReturnType< + typeof mock<(workspaceId: string, snapshot: unknown) => void> + >; + let getCandidatesMock: ReturnType Promise>>; + let generateSpy: ReturnType< + typeof spyOn + >; + + function makeWorkspaceEntry(overrides: Partial = {}): Workspace { + return { + id: workspaceId, + name: workspaceId, + path: "/test/path", + ...overrides, + } as unknown as Workspace; + } + + function makeProjectsConfig(workspaces: Workspace[]): ProjectsConfig { + return { + projects: new Map([ + [projectPath, { workspaces } as unknown as ProjectConfig], + ]), + }; + } + + // Bypass the scheduler timers so each test step is deterministic. + function createService(options?: { clock?: () => number }): AgentStatusService { + return new AgentStatusService( + mockConfig, + historyHandle.historyService, + mockTokenizer, + mockExtensionMetadata, + mockWorkspaceService, + windowService, + mockAiService, + { + clock: options?.clock, + tickIntervalMs: 60 * 60 * 1000, + } + ); + } + + function getInternals(service: AgentStatusService): AgentStatusServiceInternals { + return service as unknown as AgentStatusServiceInternals; + } + + beforeEach(async () => { + historyHandle = await createTestHistoryService(); + projectsConfig = makeProjectsConfig([makeWorkspaceEntry()]); + + mockConfig = { + loadConfigOrDefault: mock(() => projectsConfig), + getSessionDir: historyHandle.config.getSessionDir.bind(historyHandle.config), + } as unknown as Config; + + emitWorkspaceActivityMock = mock(() => undefined); + getCandidatesMock = mock((_id: string) => Promise.resolve(["anthropic:claude-haiku-4-5"])); + mockWorkspaceService = { + getWorkspaceTitleModelCandidates: getCandidatesMock, + emitWorkspaceActivity: emitWorkspaceActivityMock, + } as unknown as WorkspaceService; + + setSidebarStatusMock = mock((_workspaceId: string, _status: unknown, _options?: unknown) => + Promise.resolve({ recency: 0 }) + ); + // Default: no snapshots β†’ no workspaces are streaming β†’ idle intervals. + // Tests that exercise the active intervals override this per-test. + getAllSnapshotsMock = mock(() => Promise.resolve(new Map())); + getSnapshotMock = mock((_workspaceId: string) => Promise.resolve(null)); + mockExtensionMetadata = { + setSidebarStatus: setSidebarStatusMock, + getAllSnapshots: getAllSnapshotsMock, + getSnapshot: getSnapshotMock, + } as unknown as ExtensionMetadataService; + + mockTokenizer = { + // Cheap deterministic tokenizer (~1 token per 4 chars). + countTokensBatch: mock((_model: string, texts: string[]) => + Promise.resolve(texts.map((t) => Math.ceil(t.length / 4))) + ), + } as unknown as TokenizerService; + + mockAiService = {} as unknown as AIService; + + isFocused = true; + windowService = { isFocused: () => isFocused } as unknown as WindowService; + + generateSpy = spyOn(workspaceStatusGenerator, "generateWorkspaceStatus").mockResolvedValue( + Ok({ + status: { emoji: "πŸ› οΈ", message: "Editing source" }, + modelUsed: "anthropic:claude-haiku-4-5", + }) + ); + }); + + afterEach(async () => { + generateSpy.mockRestore(); + await historyHandle.cleanup(); + }); + + test("generates and persists a fresh AI status when chat history exists", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Please run the test suite") + ); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a1", "assistant", "Running tests now") + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + + expect(generateSpy).toHaveBeenCalledTimes(1); + const generationCall = generateSpy.mock.calls[0]; + expect(generationCall[0]).toContain("User: Please run the test suite"); + expect(generationCall[0]).toContain("Assistant: Running tests now"); + expect(generationCall[1]).toEqual(["anthropic:claude-haiku-4-5"]); + + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + const [persistedWorkspaceId, persistedStatus] = setSidebarStatusMock.mock.calls[0]; + expect(persistedWorkspaceId).toBe(workspaceId); + expect(persistedStatus).toEqual({ emoji: "πŸ› οΈ", message: "Editing source" }); + }); + + test("skips regeneration when the trailing transcript is unchanged (dedup)", async () => { + // "Frozen chat" behavior: identical hash β†’ no further LLM calls. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Idle workspace") + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + }); + + test("includes the in-flight partial assistant message so the hash refreshes mid-stream", async () => { + // The assistant's mid-stream output lives in partial.json before being + // committed to chat.jsonl. If buildTrailingTranscript ignored partials, + // the hash would stay constant during long streams and dedup would + // suppress the very updates the feature exists to surface. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a long task") + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + + const partial = createMuxMessage("a-partial", "assistant", "Reading config files"); + await historyHandle.historyService.writePartial(workspaceId, partial); + + // Dedup would have suppressed this second call if the partial was missing + // from the trailing window. + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("Assistant: Reading config files"); + }); + + test("re-generates after the trailing transcript changes", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Initial request") + ); + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "Second request") + ); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(2); + }); + + test("skips regeneration when there is no chat history yet", async () => { + // Empty workspaces have nothing to summarize. Don't pay for a + // hallucinated status, and don't blank an existing todoStatus on disk. + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).not.toHaveBeenCalled(); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + }); + + test("empty workspaces consume observed recency so they do not starve populated workspaces", async () => { + const emptyWorkspaceId = "ws-empty"; + projectsConfig = makeProjectsConfig([ + makeWorkspaceEntry({ id: emptyWorkspaceId, name: emptyWorkspaceId } as Partial), + makeWorkspaceEntry(), + ]); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Populated workspace") + ); + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([ + [emptyWorkspaceId, { streaming: false, recency: 100 }], + ]) + ) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).not.toHaveBeenCalled(); + + now += 10_000; + await internals.runTick(); + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(generateSpy.mock.calls[0][0]).toContain("User: Populated workspace"); + }); + + test("idle workspaces regenerate at the idle focused/unfocused intervals", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Hello") + ); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a1", "assistant", "Hi") + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + // First focused tick generates. We mutate history between ticks so the + // dedup hash differs β€” otherwise this test would pass for the wrong + // reason. + isFocused = true; + await internals.runTick(); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "follow-up A") + ); + expect(generateSpy).toHaveBeenCalledTimes(1); + + // Inside the focused interval: skipped. + now += 5_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + // Past the focused interval: regenerates. + now += 30_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + + // Unfocused: 60s elapsed is past focused but short of the unfocused + // interval (2 minutes), so the scheduler must wait. + isFocused = false; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u3", "user", "follow-up B") + ); + now += 60_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + + // Past the unfocused interval: regenerates. + now += 120_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(3); + }); + + test("a user message recency bump bypasses the idle cadence so stale pre-pivot status refreshes", async () => { + // User rationale: a chat message is often a real pivot to the task at + // hand. If we wait for the normal idle cadence, the sidebar can keep + // showing the old pre-pivot status after the user has clearly changed + // direction. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Initial request") + ); + + let recency = 100; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([[workspaceId, { streaming: false, recency }]]) + ) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + now += 5_000; + recency = 200; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "Pivot to new task") + ); + await internals.runTick(); + + // Still inside the 30s idle-focused interval, but the user-recency bump + // resets the clock so we regenerate against the pivot immediately. + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("User: Pivot to new task"); + + now += 5_000; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a1", "assistant", "Acknowledged") + ); + await internals.runTick(); + + // Non-user transcript changes still obey cadence when recency is stable. + expect(generateSpy).toHaveBeenCalledTimes(2); + }); + + test("does not consume a user recency bump until the pivot message reaches history", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Initial request") + ); + + let recency = 100; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([[workspaceId, { streaming: false, recency }]]) + ) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + // sendMessage updates workspace recency before the user message is + // durably appended to history. A scheduler tick in that gap sees the + // recency bump but the old transcript hash; it must leave the bump + // unconsumed so the next tick can still bypass cadence once history + // catches up. + now += 5_000; + recency = now; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "Pivot after recency") + ); + now += 10_000; + await internals.runTick(); + + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("User: Pivot after recency"); + }); + + test("defers a first recent recency bump so startup cannot settle on stale pre-pivot history", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Old request before restart") + ); + + let now = 1_000_000; + const recency = now - 1_000; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([[workspaceId, { streaming: false, recency }]]) + ) + ); + + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + // After a restart the in-memory hash baseline is empty. If this tick is + // racing with sendMessage's recency update, generating now would settle + // on old history and consume the pivot signal before the user message is + // appended. Defer one tick instead. + await internals.runTick(); + expect(generateSpy).not.toHaveBeenCalled(); + + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "Pivot after restart") + ); + now += 10_000; + await internals.runTick(); + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(generateSpy.mock.calls[0][0]).toContain("User: Pivot after restart"); + }); + + test("dedup skips consume stale recency priority after the history catchup window", async () => { + const staleWorkspaceId = "ws-stale-recency"; + projectsConfig = makeProjectsConfig([ + makeWorkspaceEntry({ id: staleWorkspaceId, name: staleWorkspaceId } as Partial), + makeWorkspaceEntry(), + ]); + await historyHandle.historyService.appendToHistory( + staleWorkspaceId, + createMuxMessage("u-stale", "user", "Already summarized") + ); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u-good", "user", "Waiting behind stale recency") + ); + + let now = 1_000_000; + let recency = 100; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([ + [staleWorkspaceId, { streaming: false, recency }], + ]) + ) + ); + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(generateSpy.mock.calls[0][0]).toContain("User: Already summarized"); + + now += 5_000; + recency = now; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + now += 10_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + now += 10_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("User: Waiting behind stale recency"); + }); + + test("streaming workspaces regenerate at the active intervals (10s focused, 30s unfocused)", async () => { + // The user-visible reason this test exists: when an agent is actively + // working, the sidebar status should refresh fast enough that the user + // can follow along (every 10s when watching, every 30s otherwise), + // versus the slower 30s/120s cadence for chats that aren't moving. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a long task") + ); + // Mark the workspace as currently streaming so dispatch picks the + // active intervals. + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve(new Map([[workspaceId, { streaming: true }]])) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + isFocused = true; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + // 5s elapsed: inside the active-focused 10s interval β†’ skip. + now += 5_000; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a1", "assistant", "step one") + ); + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + // 10s elapsed: at the active-focused interval β†’ regenerates. + now += 5_000; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a2", "assistant", "step two") + ); + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + + // Unfocused: 10s past last run is inside the 30s active-unfocused + // interval β†’ skip. Only at 30s does it regenerate. + isFocused = false; + now += 10_000; + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("a3", "assistant", "step three") + ); + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + + now += 20_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(3); + }); + + test("round-robins across multiple workspaces so none starve under MAX_CONCURRENT=1", async () => { + // With MAX_CONCURRENT=1 and a fixed iteration order, the first workspace + // would always become re-eligible before later ones got a turn. The + // scheduler must prioritize least-recently-run workspaces. + const projectPathLocal = "/test/round-robin-project"; + const ids = ["ws-a", "ws-b", "ws-c"]; + const workspaces = ids.map( + (id) => ({ id, name: id, path: `/test/path/${id}` }) as unknown as Workspace + ); + projectsConfig = { + projects: new Map([ + [projectPathLocal, { workspaces } as unknown as ProjectConfig], + ]), + }; + for (const id of ids) { + await historyHandle.historyService.appendToHistory( + id, + createMuxMessage(`u1-${id}`, "user", `prompt for ${id}`) + ); + } + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + // Tick 1 covers one workspace; ticks 2 and 3 each cover a distinct + // never-run workspace before any repeat (least-recently-run wins). + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + now += 31_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(2); + now += 31_000; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(3); + const persistedIds = setSidebarStatusMock.mock.calls.map((call) => call[0]); + expect(new Set(persistedIds)).toEqual(new Set(ids)); + }); + + test("does not invoke the generator if stopped during transcript build or candidates fetch", async () => { + // Earlier awaits (history read, candidates fetch) are also yield points. + // If stop() fires during one of them, kicking off the multi-second + // provider call afterwards would leak LLM work past the service's + // declared lifecycle. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "long-running task") + ); + + let releaseCandidates!: () => void; + const gate = new Promise((resolve) => { + releaseCandidates = resolve; + }); + getCandidatesMock.mockImplementationOnce(async () => { + await gate; + return ["anthropic:claude-haiku-4-5"]; + }); + + const service = createService(); + const inFlight = getInternals(service).runForWorkspace(workspaceId); + service.stop(); + releaseCandidates(); + await inFlight; + + expect(generateSpy).not.toHaveBeenCalled(); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + }); + + test("does not persist or emit if the service is stopped while a generation is in flight", async () => { + // Real provider calls can take seconds to minutes. If stop() fires + // mid-generation (app shutdown), persisting afterwards would leak writes + // past the declared lifecycle. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "long-running task") + ); + + // Two-stage gate: signal when the generator actually starts (so the + // test can fire stop() after the pre-generator guard has passed) and + // a release the test holds until it's ready for the generator to + // resolve. + let signalStarted!: () => void; + const startedSignal = new Promise((resolve) => { + signalStarted = resolve; + }); + let releaseGenerate!: () => void; + const gate = new Promise((resolve) => { + releaseGenerate = resolve; + }); + generateSpy.mockImplementationOnce(async () => { + signalStarted(); + await gate; + return Ok({ + status: { emoji: "πŸ› οΈ", message: "Doing work" }, + modelUsed: "anthropic:claude-haiku-4-5", + }); + }); + + const service = createService(); + const inFlight = getInternals(service).runForWorkspace(workspaceId); + await startedSignal; + service.stop(); + releaseGenerate(); + await inFlight; + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + }); + + test("drops a generated status if workspace recency advances while provider call is in flight", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Old task") + ); + + let recency = 100; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([[workspaceId, { streaming: false, recency }]]) + ) + ); + setSidebarStatusMock.mockImplementation((_workspaceId, _status, options) => + Promise.resolve( + options?.skipIfRecencyAdvancedSince != null && recency > options.skipIfRecencyAdvancedSince + ? null + : { recency } + ) + ); + + let signalStarted!: () => void; + const startedSignal = new Promise((resolve) => { + signalStarted = resolve; + }); + let releaseGenerate!: () => void; + const gate = new Promise((resolve) => { + releaseGenerate = resolve; + }); + generateSpy.mockImplementationOnce(async () => { + signalStarted(); + await gate; + return Ok({ + status: { emoji: "πŸ› οΈ", message: "Summarizing old task" }, + modelUsed: "anthropic:claude-haiku-4-5", + }); + }); + + const service = createService(); + const inFlight = getInternals(service).runForWorkspace(workspaceId, recency); + await startedSignal; + + // A user message can advance recency while the provider is still working + // on the old transcript. The old result must not be written after that + // pivot, or the sidebar can resurrect stale pre-pivot status. + recency = 200; + releaseGenerate(); + await inFlight; + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock.mock.calls[0][2]).toEqual({ skipIfRecencyAdvancedSince: 100 }); + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + }); + + test("a failed persistence write does not update the dedup hash, so the next tick retries", async () => { + // Only update lastInputHash AFTER a successful persist. Otherwise a + // transient I/O failure would leave us dedup'ing against a hash that + // never made it to disk, silently dropping subsequent retries. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a task") + ); + + setSidebarStatusMock.mockImplementationOnce(() => Promise.reject(new Error("disk full"))); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + // Activity must not emit on persist failure. + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + + // Same transcript, second pass: retries because the previous failure + // left lastInputHash unchanged. + setSidebarStatusMock.mockImplementation((_w, _s) => Promise.resolve({ recency: 0 })); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(2); + expect(emitWorkspaceActivityMock).toHaveBeenCalledTimes(1); + }); + + test("setSidebarStatus must not bump workspace recency (would re-sort idle workspaces)", async () => { + // AgentStatusService is a background scheduler with no causal + // connection to user activity, so its writes must not bump recency β€” + // that would promote idle workspaces in the sidebar and mark them + // unread every tick. Test ExtensionMetadataService directly to pin the + // contract for any future caller of setSidebarStatus. + const dir = mkdtempSync(join(tmpdir(), "mux-recency-")); + try { + const svc = new ExtensionMetadataService(join(dir, "metadata.json")); + await svc.updateRecency("ws", 100); + await svc.setSidebarStatus("ws", { emoji: "πŸ› οΈ", message: "Doing work" }); + const after = await svc.getSnapshot("ws"); + expect(after?.recency).toBe(100); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("setSidebarStatus can atomically skip when recency advanced", async () => { + const dir = mkdtempSync(join(tmpdir(), "mux-recency-skip-")); + try { + const svc = new ExtensionMetadataService(join(dir, "metadata.json")); + await svc.updateRecency("ws", 200); + const skipped = await svc.setSidebarStatus( + "ws", + { emoji: "πŸ› οΈ", message: "Old status" }, + { skipIfRecencyAdvancedSince: 100 } + ); + const after = await svc.getSnapshot("ws"); + + expect(skipped).toBeNull(); + expect(after?.todoStatus).toBeUndefined(); + expect(after?.recency).toBe(200); + } finally { + rmSync(dir, { recursive: true, force: true }); + } + }); + + test("rejects generic placeholder messages and advances dedup so we don't loop", async () => { + // Codex review: even with the prompt steering away from "Awaiting next + // task" et al., small models can still emit them. We must reject them + // post-generation so they never reach the sidebar β€” and we must NOT + // re-call the model on the same transcript, because we'd just get the + // same placeholder back and burn provider budget. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a task") + ); + + generateSpy.mockResolvedValueOnce( + Ok({ + status: { emoji: "πŸ’€", message: "Awaiting next task" }, + modelUsed: "anthropic:claude-haiku-4-5", + }) + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + + // Generator was called, but persist was skipped: the placeholder must + // not reach the sidebar. + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + + // Same transcript again: dedup must skip β€” we already learned this + // input produces a placeholder, no point retrying until it changes. + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + + // After a genuine transcript change, we try again with a fresh result. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "follow-up message") + ); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + expect(emitWorkspaceActivityMock).toHaveBeenCalledTimes(1); + }); + + test("post-provider failure advances dedup so we don't resend the same transcript every tick", async () => { + // Codex review: when status generation fails AFTER reaching the + // provider (e.g., the chosen model refuses to call propose_status, or + // hits a persistent provider error), leaving lastInputHash unchanged + // would let the scheduler resend the exact same trailing transcript on + // every focused/idle interval, burning tokens against a workspace that + // is stuck. Once we've attempted generation against the provider, the + // only retry signal that matters is a real transcript change. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a task") + ); + + generateSpy.mockResolvedValueOnce( + Err({ + error: { type: "unknown", raw: "model did not call propose_status" }, + reachedProvider: true, + }) + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + + // Generator was called and failed; nothing reached the sidebar. + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + expect(emitWorkspaceActivityMock).not.toHaveBeenCalled(); + + // Same transcript again: dedup must skip β€” we already learned that this + // input fails, no point retrying until something changes. + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + + // After a genuine transcript change, we try again. This time the + // generator returns a fresh result and it gets persisted normally. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "follow-up message") + ); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + expect(emitWorkspaceActivityMock).toHaveBeenCalledTimes(1); + }); + + test("pre-provider failure (auth/config) keeps retrying so a later credential fix recovers", async () => { + // Codex review: if the first attempt happens before the user has + // connected OAuth / configured an API key (or while a provider is + // disabled), generateWorkspaceStatus returns an Err whose + // reachedProvider flag is false β€” every candidate failed at + // createModel, never crossed the wire to a provider. Caching that + // failure with the transcript hash would silently freeze the workspace + // out of AI status until the chat advances on its own. Pre-provider + // failures must therefore stay retriable: the next tick must call + // generateWorkspaceStatus again so a later credential/provider fix + // recovers without requiring a new user message. + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "kick off a task") + ); + + generateSpy.mockResolvedValueOnce( + Err({ + error: { + type: "authentication", + authKind: "api_key_missing", + provider: "anthropic", + }, + reachedProvider: false, + }) + ); + + const service = createService(); + await getInternals(service).runForWorkspace(workspaceId); + + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + + // Same transcript, no fix yet: must retry. The scheduler still picks + // this workspace up because the dedup hash didn't advance. + generateSpy.mockResolvedValueOnce( + Err({ + error: { + type: "authentication", + authKind: "api_key_missing", + provider: "anthropic", + }, + reachedProvider: false, + }) + ); + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + + // User fixes credentials β†’ next attempt succeeds against the same + // transcript (no chat change required). + await getInternals(service).runForWorkspace(workspaceId); + expect(generateSpy).toHaveBeenCalledTimes(3); + expect(setSidebarStatusMock).toHaveBeenCalledTimes(1); + expect(emitWorkspaceActivityMock).toHaveBeenCalledTimes(1); + }); + + test("pre-provider failures consume recency priority without advancing transcript dedup", async () => { + const misconfiguredWorkspaceId = "ws-misconfigured"; + projectsConfig = makeProjectsConfig([ + makeWorkspaceEntry({ + id: misconfiguredWorkspaceId, + name: misconfiguredWorkspaceId, + } as Partial), + makeWorkspaceEntry(), + ]); + await historyHandle.historyService.appendToHistory( + misconfiguredWorkspaceId, + createMuxMessage("u-bad", "user", "Misconfigured workspace") + ); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u-good", "user", "Healthy workspace") + ); + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([ + [misconfiguredWorkspaceId, { streaming: false, recency: 100 }], + ]) + ) + ); + generateSpy.mockResolvedValueOnce( + Err({ + error: { + type: "authentication", + authKind: "api_key_missing", + provider: "anthropic", + }, + reachedProvider: false, + }) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + expect(generateSpy.mock.calls[0][0]).toContain("User: Misconfigured workspace"); + + now += 10_000; + await internals.runTick(); + + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("User: Healthy workspace"); + }); + + test("pre-provider retry state does not consume a recency bump before history catches up", async () => { + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Old misconfigured request") + ); + let recency = 100; + getAllSnapshotsMock.mockImplementation(() => + Promise.resolve( + new Map([[workspaceId, { streaming: false, recency }]]) + ) + ); + generateSpy.mockResolvedValueOnce( + Err({ + error: { + type: "authentication", + authKind: "api_key_missing", + provider: "anthropic", + }, + reachedProvider: false, + }) + ); + + let now = 1_000_000; + const service = createService({ clock: () => now }); + const internals = getInternals(service); + + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + now += 5_000; + recency = now; + await internals.runTick(); + expect(generateSpy).toHaveBeenCalledTimes(1); + + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u2", "user", "Pivot after config failure") + ); + now += 10_000; + await internals.runTick(); + + expect(generateSpy).toHaveBeenCalledTimes(2); + expect(generateSpy.mock.calls[1][0]).toContain("User: Pivot after config failure"); + }); + + test("archived workspaces are not regenerated", async () => { + projectsConfig = makeProjectsConfig([ + makeWorkspaceEntry({ archivedAt: new Date().toISOString() } as Partial), + ]); + await historyHandle.historyService.appendToHistory( + workspaceId, + createMuxMessage("u1", "user", "Archived chat") + ); + + const service = createService(); + await getInternals(service).runTick(); + + expect(generateSpy).not.toHaveBeenCalled(); + expect(setSidebarStatusMock).not.toHaveBeenCalled(); + }); +}); diff --git a/src/node/services/agentStatusService.ts b/src/node/services/agentStatusService.ts new file mode 100644 index 0000000000..8464581d3a --- /dev/null +++ b/src/node/services/agentStatusService.ts @@ -0,0 +1,526 @@ +import { createHash } from "crypto"; +import assert from "@/common/utils/assert"; +import { + AGENT_STATUS_ACTIVE_FOCUSED_INTERVAL_MS, + AGENT_STATUS_ACTIVE_UNFOCUSED_INTERVAL_MS, + AGENT_STATUS_IDLE_FOCUSED_INTERVAL_MS, + AGENT_STATUS_IDLE_UNFOCUSED_INTERVAL_MS, + AGENT_STATUS_MAX_CONCURRENT, + AGENT_STATUS_MAX_MESSAGE_CHARS, + AGENT_STATUS_MAX_TRAILING_MESSAGES, + AGENT_STATUS_MAX_TRANSCRIPT_TOKENS, + AGENT_STATUS_TICK_INTERVAL_MS, +} from "@/constants/agentStatus"; +import type { Config } from "@/node/config"; +import type { MuxMessage } from "@/common/types/message"; +import { isWorkspaceArchived } from "@/common/utils/archive"; +import type { AIService } from "./aiService"; +import type { ExtensionMetadataService } from "./ExtensionMetadataService"; +import type { HistoryService } from "./historyService"; +import type { TokenizerService } from "./tokenizerService"; +import type { WindowService } from "./windowService"; +import type { WorkspaceService } from "./workspaceService"; +import { generateWorkspaceStatus } from "./workspaceStatusGenerator"; +import { log } from "./log"; + +const FALLBACK_TOKENIZER_MODEL = "anthropic:claude-haiku-4-5"; + +export interface AgentStatusServiceOptions { + /** Override for test injection. Defaults to `Date.now`. */ + clock?: () => number; + /** Override scheduler tick interval. Defaults to AGENT_STATUS_TICK_INTERVAL_MS. */ + tickIntervalMs?: number; +} + +interface State { + /** Last time we ran (or skipped via dedup). 0 if we never ran. */ + lastRanAt: number; + /** + * Hash of the input we last "settled" on β€” i.e. an outcome that depends + * on the *transcript* and shouldn't be retried until the transcript + * changes. That covers: + * - successful persists (Ok result, status written), + * - post-generation placeholder rejection, + * - generation failures that reached the provider (model refused tool, + * rate limit, persistent provider error, etc.). + * + * Pre-provider failures (no API key, OAuth not connected, provider + * disabled, model not available, policy denied β€” anything that fails + * inside createModel before we cross the wire) intentionally do NOT + * advance this hash. Those are properties of the user's *config*, and + * caching them by transcript would freeze a workspace out of AI status + * until a new chat message arrived, even after the user fixed + * credentials. See the `result.error.reachedProvider` branch in + * `runForWorkspace`. + * + * null if we have never settled on a transcript for this workspace. + */ + lastInputHash: string | null; + /** + * Hash of the transcript the scheduler last examined, even if that input + * did not settle into a sidebar status (for example, a pre-provider config + * failure). Used to avoid consuming a recency bump while history is still + * catching up to the user message that caused it. + */ + lastSeenInputHash: string | null; + /** + * Recency timestamp observed the last time the scheduler considered this + * workspace. User messages update recency, so an increased value is a + * strong signal that the old sidebar status may now be stale even if the + * normal idle/active cadence has not elapsed yet. + */ + lastObservedRecency: number | null; + /** Whether a generation is currently in flight. */ + inFlight: boolean; +} + +/** + * Periodic backend job that produces the sidebar's AI-generated agent status + * using the same "small model" path as workspace title generation. + * + * Cadence: streaming workspaces refresh fast so the user can follow along; + * idle workspaces back off. Both back off further when the desktop window + * is blurred. See ACTIVE_/IDLE_ intervals in @/constants/agentStatus. + * + * Dedup: each generation hashes its trailing-transcript window. Identical + * hash to the last settled run skips regeneration (idle/frozen chats). + * + * Concurrency: bounded by AGENT_STATUS_MAX_CONCURRENT so a multi-workspace + * sweep never spikes provider load. + */ +export class AgentStatusService { + private readonly tracked = new Map(); + private readonly inFlightPromises = new Set>(); + private readonly clock: () => number; + private readonly tickIntervalMs: number; + + private checkInterval: ReturnType | null = null; + private stopped = false; + private tickInFlight = false; + + constructor( + private readonly config: Config, + private readonly historyService: HistoryService, + private readonly tokenizerService: TokenizerService, + private readonly extensionMetadata: ExtensionMetadataService, + private readonly workspaceService: WorkspaceService, + private readonly windowService: WindowService, + private readonly aiService: AIService, + options: AgentStatusServiceOptions = {} + ) { + this.clock = options.clock ?? (() => Date.now()); + this.tickIntervalMs = options.tickIntervalMs ?? AGENT_STATUS_TICK_INTERVAL_MS; + } + + start(): void { + assert(this.checkInterval === null, "AgentStatusService.start() called while already running"); + this.stopped = false; + // No startup delay: AGENT_STATUS_MAX_CONCURRENT=1 already serializes + // generation across workspaces, so an immediate first tick won't create a + // thundering herd at launch. + this.checkInterval = setInterval(() => void this.runTick(), this.tickIntervalMs); + void this.runTick(); + log.info("AgentStatusService started", { tickIntervalMs: this.tickIntervalMs }); + } + + stop(): void { + this.stopped = true; + if (this.checkInterval) { + clearInterval(this.checkInterval); + this.checkInterval = null; + } + this.tracked.clear(); + this.inFlightPromises.clear(); + this.tickInFlight = false; + log.info("AgentStatusService stopped"); + } + + private async runTick(): Promise { + if (this.stopped || this.tickInFlight) return; + this.tickInFlight = true; + try { + // Anchor lastRanAt below to tick start time. With tick=10s and + // active-focused interval=10s, that makes the eligibility math exact: + // tick[k+1] - tick[k] === interval, so the workspace runs every tick. + // Otherwise sub-ms timer drift can degrade actual cadence to 2Γ— the + // configured interval. + const tickStartedAt = this.clock(); + await this.dispatch(tickStartedAt); + // Awaited so production callers and tests observe completion. + await Promise.allSettled([...this.inFlightPromises]); + } catch (error) { + log.error("AgentStatusService tick failed", { error }); + } finally { + this.tickInFlight = false; + } + } + + private async dispatch(tickStartedAt: number): Promise { + const focused = this.windowService.isFocused(); + // One disk read per tick for streaming state across all workspaces. + // Cheap, and avoids N reads inside the inner loop. + const snapshots = await this.extensionMetadata.getAllSnapshots(); + + // Sort eligible workspaces by lastRanAt ascending. With MAX_CONCURRENT=1, + // a fixed iteration order would let the first workspace starve the rest; + // least-recently-run gives fair round-robin without an explicit queue. + const eligible: Array<{ + id: string; + lastRanAt: number; + recency: number | null; + recencyAdvanced: boolean; + }> = []; + for (const [, projectConfig] of this.config.loadConfigOrDefault().projects) { + for (const ws of projectConfig.workspaces) { + const id = ws.id ?? ws.name; + if (typeof id !== "string" || id.length === 0) continue; + if (isWorkspaceArchived(ws.archivedAt, ws.unarchivedAt)) continue; + const state = this.tracked.get(id); + if (state?.inFlight) continue; + const snapshot = snapshots.get(id); + const recency = typeof snapshot?.recency === "number" ? snapshot.recency : null; + const recencyAdvanced = hasRecencyAdvanced(state, recency); + const interval = pickInterval(snapshot?.streaming === true, focused); + if (state && !recencyAdvanced && tickStartedAt - state.lastRanAt < interval) continue; + eligible.push({ id, lastRanAt: state?.lastRanAt ?? 0, recency, recencyAdvanced }); + } + } + eligible.sort((a, b) => { + if (a.recencyAdvanced !== b.recencyAdvanced) { + // A user message is usually a task pivot. Put those workspaces ahead + // of ordinary cadence refreshes so stale pre-pivot statuses don't + // linger behind background idle work. + return a.recencyAdvanced ? -1 : 1; + } + return a.lastRanAt - b.lastRanAt; + }); + + for (const { id, recency } of eligible) { + if (this.stopped || this.inFlightPromises.size >= AGENT_STATUS_MAX_CONCURRENT) return; + const state = this.ensureState(id); + state.inFlight = true; + // Set lastRanAt at dispatch time (not after the async transcript + // build) so cadence is anchored to tick boundaries β€” see runTick. + state.lastRanAt = tickStartedAt; + const promise = this.runForWorkspace(id, recency).finally(() => { + state.inFlight = false; + this.inFlightPromises.delete(promise); + }); + this.inFlightPromises.add(promise); + } + } + + private async runForWorkspace( + workspaceId: string, + observedRecency: number | null = null + ): Promise { + try { + const transcript = await this.buildTrailingTranscript(workspaceId); + const inputHash = computeInputHash(transcript); + // dispatch() set lastRanAt to the tick start time before kicking us + // off, so the scheduler won't reconsider this workspace until the next + // interval boundary unless a newer user-recency timestamp indicates the + // chat pivoted again. + const state = this.ensureState(workspaceId); + + const markRecencyObserved = () => { + if (observedRecency !== null) { + state.lastObservedRecency = observedRecency; + } + }; + + if ( + isRecentRecencyAheadOfHistory( + state, + inputHash, + observedRecency, + this.clock(), + AGENT_STATUS_TICK_INTERVAL_MS + ) + ) { + state.lastSeenInputHash = inputHash; + // We may be seeing WorkspaceService's recency update before the + // corresponding user message is appended to history. If the transcript + // is unchanged from the last one we examined (or we have no baseline + // immediately after startup), generating now could persist a stale + // pre-pivot status and consume the only recency signal. Wait one + // scheduler interval so the history write can catch up. + log.debug("AgentStatusService: waiting for recent recency bump to reach history", { + workspaceId, + observedRecency, + }); + return; + } + state.lastSeenInputHash = inputHash; + + // Empty workspace: nothing to summarize. Don't blank an existing + // todoStatus β€” that would clobber a status produced before compaction. + // Still consume non-racy recency so an empty workspace doesn't sort as + // "recency advanced" forever and starve other workspaces under the + // single-concurrency scheduler. + if (transcript.trim().length === 0) { + markRecencyObserved(); + return; + } + // Idle/frozen: identical trailing window since last settled run. The + // recent race path above already handles recency that may be ahead of + // history, so any recency reaching this dedup branch is stale/non-racy: + // consume it to avoid permanent recency-advanced priority. + if (state.lastInputHash === inputHash) { + markRecencyObserved(); + return; + } + + const candidates = await this.workspaceService.getWorkspaceTitleModelCandidates(workspaceId); + if (candidates.length === 0) return; + + // Skip the expensive provider call if stop() fired during any of the + // earlier awaits (transcript build, candidates fetch). The generator + // can take seconds to a minute, so kicking it off after shutdown + // would leak background LLM work past our lifecycle. + if (this.stopped) return; + const result = await generateWorkspaceStatus(transcript, candidates, this.aiService); + // Re-check after the generator returns: the same hazard at a later + // await boundary. + if (this.stopped) return; + if (!result.success) { + // Only advance the dedup hash when at least one candidate actually + // reached the provider. If every candidate failed during model + // construction (no API key, OAuth not connected, provider disabled, + // model not available, policy denied, etc.), the failure is about + // the user's *config* rather than the transcript β€” caching it would + // permanently skip this workspace until they happen to send another + // message, even after they fix credentials. Post-provider failures + // (model refused tool, rate limit, persistent provider error) are + // properties of the transcript and should defer until the chat + // changes. + if (result.error.reachedProvider) { + log.debug( + "AgentStatusService: status generation failed at provider; deferring until transcript changes", + { workspaceId, error: result.error.error } + ); + markRecencyObserved(); + state.lastInputHash = inputHash; + } else { + log.debug( + "AgentStatusService: status generation failed before reaching provider; will retry on cadence", + { workspaceId, error: result.error.error } + ); + // Consume recency without advancing lastInputHash: credential/config + // fixes should still retry the same transcript, but a misconfigured + // workspace must not retain permanent recency-advanced priority and + // starve other workspaces under max concurrency 1. + markRecencyObserved(); + } + return; + } + + // Defense in depth: even with a tuned prompt, small models can + // occasionally produce a generic placeholder ("Awaiting next task", + // "Doing work", etc.) that conveys no information. Reject those + // outputs before they reach the sidebar. Advance lastInputHash so we + // don't burn provider budget retrying the same transcript on every + // tick β€” the next genuine transcript change will trigger a fresh + // attempt. + if (isPlaceholderStatus(result.data.status.message)) { + log.debug("AgentStatusService: model produced placeholder status; skipping persist", { + workspaceId, + message: result.data.status.message, + }); + markRecencyObserved(); + state.lastInputHash = inputHash; + return; + } + + // Persist BEFORE updating the in-memory dedup hash. If the disk write + // fails we want the next tick to retry against the same transcript + // instead of dedup'ing against a hash we never committed. + try { + const snapshot = await this.extensionMetadata.setSidebarStatus( + workspaceId, + result.data.status, + { skipIfRecencyAdvancedSince: observedRecency } + ); + if (this.stopped) return; + if (!snapshot) { + // The recency check happens inside ExtensionMetadataService's + // serialized mutation queue, immediately before the status write. + // That makes it atomic with fire-and-forget user-recency writes: + // a slow provider response cannot resurrect a pre-pivot status + // after a newer user turn has queued or committed its recency bump. + log.debug("AgentStatusService: dropping generated status after newer recency", { + workspaceId, + observedRecency, + }); + return; + } + markRecencyObserved(); + state.lastInputHash = inputHash; + this.workspaceService.emitWorkspaceActivity(workspaceId, snapshot); + } catch (error) { + log.error("AgentStatusService: failed to persist generated status", { + workspaceId, + error, + }); + } + } catch (error) { + log.error("AgentStatusService: unexpected error during status generation", { + workspaceId, + error, + }); + } + } + + private ensureState(id: string): State { + let state = this.tracked.get(id); + if (!state) { + state = { + lastRanAt: 0, + lastInputHash: null, + lastSeenInputHash: null, + lastObservedRecency: null, + inFlight: false, + }; + this.tracked.set(id, state); + } + return state; + } + + /** + * Build the trailing chat transcript, capped by message count and + * AGENT_STATUS_MAX_TRANSCRIPT_TOKENS. Includes the in-flight partial + * assistant message (HistoryService.readPartial) so the hash refreshes + * mid-stream β€” exactly when "what is the agent doing now" matters most. + */ + private async buildTrailingTranscript(workspaceId: string): Promise { + const result = await this.historyService.getLastMessages( + workspaceId, + AGENT_STATUS_MAX_TRAILING_MESSAGES + ); + if (!result.success) return ""; + + const messages: MuxMessage[] = [...result.data]; + const partial = await this.historyService.readPartial(workspaceId); + if (partial) messages.push(partial); + + const formatted = messages.map(formatMessageForTranscript).filter((s) => s.length > 0); + if (formatted.length === 0) return ""; + + // Trim from the front (oldest) until we fit the token budget. Trailing + // messages carry the most signal for "what is the agent doing right now", + // so we never drop them. The tokenizer service falls back to a known + // family for unknown models, so the fallback constant is safe regardless + // of which model actually generates this workspace's status. + const tokenCounts = await this.tokenizerService.countTokensBatch( + FALLBACK_TOKENIZER_MODEL, + formatted + ); + + let totalTokens = tokenCounts.reduce((sum, n) => sum + n, 0); + let drop = 0; + while (totalTokens > AGENT_STATUS_MAX_TRANSCRIPT_TOKENS && drop < formatted.length - 1) { + totalTokens -= tokenCounts[drop]; + drop += 1; + } + return formatted.slice(drop).join("\n\n"); + } +} + +function extractMessageText(message: MuxMessage): string { + return (message.parts ?? []) + .filter((part): part is { type: "text"; text: string } => part.type === "text") + .map((part) => part.text.trim()) + .filter((text) => text.length > 0) + .join("\n"); +} + +function summarizeToolPart(part: unknown): string | null { + if (typeof part !== "object" || part === null) return null; + const record = part as { type?: unknown; toolName?: unknown }; + const type = typeof record.type === "string" ? record.type : null; + if (!type) return null; + // Tool calls have type "tool-" or "dynamic-tool" with a toolName. + const toolName = + typeof record.toolName === "string" + ? record.toolName + : type.startsWith("tool-") + ? type.slice(5) + : null; + return toolName ? `[tool ${toolName}]` : null; +} + +function formatMessageForTranscript(message: MuxMessage): string { + const role = message.role === "user" ? "User" : message.role === "assistant" ? "Assistant" : null; + if (!role) return ""; + + const segments: string[] = []; + const text = extractMessageText(message).slice(0, AGENT_STATUS_MAX_MESSAGE_CHARS); + if (text) segments.push(text); + + // Tool-call summaries let the model see what the agent is doing even when + // the assistant has not emitted natural-language text yet. Args/output are + // intentionally omitted to keep cost predictable. + const tools = (message.parts ?? []).map(summarizeToolPart).filter((s): s is string => s !== null); + if (tools.length > 0) segments.push(tools.join(" ")); + + return segments.length === 0 ? "" : `${role}: ${segments.join("\n")}`; +} + +function computeInputHash(transcript: string): string { + return createHash("sha256").update(transcript).digest("hex"); +} + +/** + * Generic non-informative status messages. Even with the prompt steering + * the model away from these, providers occasionally emit them (especially + * when the transcript is short or paused). We reject them post-generation + * rather than letting them reach the sidebar. + * + * Match is exact + case-insensitive on the trimmed message; we don't + * substring-match because legitimate phrases like "Awaiting user reply" + * contain "Awaiting" and shouldn't be filtered. + */ +const PLACEHOLDER_STATUS_MESSAGES: ReadonlySet = new Set([ + "awaiting next task", + "awaiting input", + "doing work", + "idle", + "working", + "no recent activity", +]); + +function isPlaceholderStatus(message: string): boolean { + return PLACEHOLDER_STATUS_MESSAGES.has(message.trim().toLowerCase()); +} + +function isRecentRecencyAheadOfHistory( + state: State, + inputHash: string, + observedRecency: number | null, + now: number, + historyCatchupWindowMs: number +): boolean { + return ( + hasRecencyAdvanced(state, observedRecency) && + (state.lastSeenInputHash === null || state.lastSeenInputHash === inputHash) && + observedRecency !== null && + now - observedRecency < historyCatchupWindowMs + ); +} + +function hasRecencyAdvanced(state: State | undefined, recency: number | null): boolean { + return ( + state !== undefined && + recency !== null && + (state.lastObservedRecency === null || recency > state.lastObservedRecency) + ); +} + +function pickInterval(streaming: boolean, focused: boolean): number { + if (streaming) { + return focused + ? AGENT_STATUS_ACTIVE_FOCUSED_INTERVAL_MS + : AGENT_STATUS_ACTIVE_UNFOCUSED_INTERVAL_MS; + } + return focused ? AGENT_STATUS_IDLE_FOCUSED_INTERVAL_MS : AGENT_STATUS_IDLE_UNFOCUSED_INTERVAL_MS; +} diff --git a/src/node/services/serviceContainer.ts b/src/node/services/serviceContainer.ts index f0cbc98db0..1893c04142 100644 --- a/src/node/services/serviceContainer.ts +++ b/src/node/services/serviceContainer.ts @@ -43,6 +43,7 @@ import { ExperimentsService } from "@/node/services/experimentsService"; import { WorkspaceMcpOverridesService } from "@/node/services/workspaceMcpOverridesService"; import { McpOauthService } from "@/node/services/mcpOauthService"; import { HeartbeatService } from "@/node/services/heartbeatService"; +import { AgentStatusService } from "@/node/services/agentStatusService"; import { IdleCompactionService } from "@/node/services/idleCompactionService"; import { getSigningService, type SigningService } from "@/node/services/signingService"; import { coderService, type CoderService } from "@/node/services/coderService"; @@ -127,6 +128,7 @@ export class ServiceContainer { private readonly ptyService: PTYService; public readonly idleCompactionService: IdleCompactionService; public readonly heartbeatService: HeartbeatService; + public readonly agentStatusService: AgentStatusService; constructor(config: Config) { this.config = config; @@ -275,6 +277,18 @@ export class ServiceContainer { this.editorService = new EditorService(config); this.updateService = new UpdateService(this.config); this.tokenizerService = new TokenizerService(this.sessionUsageService); + // AgentStatusService depends on tokenizer + window focus state; instantiate + // after both are constructed so the small-model status loop can run with + // accurate token budgeting and focus-aware cadence. + this.agentStatusService = new AgentStatusService( + config, + this.historyService, + this.tokenizerService, + this.extensionMetadata, + this.workspaceService, + this.windowService, + this.aiService + ); this.serverService = new ServerService(); this.menuEventService = new MenuEventService(); this.voiceService = new VoiceService( @@ -428,6 +442,10 @@ export class ServiceContainer { this.heartbeatService.start(); stepDurationsMs["heartbeatService.start"] = Date.now() - heartbeatStartedAt; + const agentStatusStartedAt = Date.now(); + this.agentStatusService.start(); + stepDurationsMs["agentStatusService.start"] = Date.now() - agentStatusStartedAt; + // Refresh mux-owned Coder SSH config in background (handles binary path changes on restart) // Skip getCoderInfo() to avoid caching "unavailable" if coder isn't installed yet void this.coderService.ensureMuxCoderSSHConfig().catch((error: unknown) => { @@ -505,6 +523,7 @@ export class ServiceContainer { this.desktopTokenManager.dispose(); await this.desktopSessionManager.closeAll(); this.heartbeatService.stop(); + this.agentStatusService.stop(); this.idleCompactionService.stop(); await this.browserBridgeServer.stop(); this.browserSessionStateHub.dispose(); @@ -530,6 +549,13 @@ export class ServiceContainer { await this.desktopBridgeServer.stop(); this.desktopTokenManager.dispose(); await this.desktopSessionManager.closeAll(); + // Stop the periodic AgentStatusService loop here too (not just in + // shutdown()): dispose() is the path used by the desktop before-quit + // and ACP in-process close handlers, and the ref'd setInterval would + // otherwise keep the process alive and continue calling + // generateWorkspaceStatus against services that are about to be torn + // down below. + this.agentStatusService.stop(); await this.browserBridgeServer.stop(); this.browserSessionStateHub.dispose(); this.browserBridgeTokenManager.dispose(); diff --git a/src/node/services/windowService.ts b/src/node/services/windowService.ts index 20712998c6..35b0dce599 100644 --- a/src/node/services/windowService.ts +++ b/src/node/services/windowService.ts @@ -14,6 +14,15 @@ export class WindowService { this.restartAppHandler = handler; } + /** + * Whether the desktop main window is currently focused. Falls back to + * `true` in non-desktop contexts (CLI server, tests) so backend services + * don't throttle themselves to "unfocused" cadence when there is no window. + */ + isFocused(): boolean { + return this.mainWindow?.isFocused?.() ?? true; + } + async restartApp(): Promise<{ supported: true } | { supported: false; message: string }> { const restartAppHandler = this.restartAppHandler; if (!restartAppHandler) { diff --git a/src/node/services/workspaceService.test.ts b/src/node/services/workspaceService.test.ts index 4d4084ad4e..48f6e35471 100644 --- a/src/node/services/workspaceService.test.ts +++ b/src/node/services/workspaceService.test.ts @@ -1638,8 +1638,66 @@ describe("WorkspaceService idle compaction dispatch", () => { await internals.updateStreamingStatus(workspaceId, false); expect(internals.idleCompactingWorkspaces.has(workspaceId)).toBe(false); + // todoStatus is intentionally NOT passed when there are no todos β€” + // passing null would delete an AgentStatusService-written AI summary + // from the same slot. Explicit clears happen via setTodoStatus. expect(setStreaming).toHaveBeenCalledWith(workspaceId, false, { hasTodos: false, + }); + }); + + test("stream-stop with no todos does NOT clear todoStatus (preserves AI summary)", async () => { + // Codex: AgentStatusService writes its AI-generated summary into the + // same `todoStatus` slot that `setTodoStatus` uses. The stream-stop + // path used to read an empty todo list and pass `todoStatus: null`, + // which deleted the slot β€” wiping a summary that was just generated + // during the stream. Free-form chats (no todos) hit this every turn. + const workspaceId = "stream-stop-preserves-ai-status"; + const snapshot = { + recency: Date.now(), + streaming: false, + lastModel: "claude-sonnet-4", + lastThinkingLevel: null, + }; + const setStreaming = mock(() => Promise.resolve(snapshot)); + const emitWorkspaceActivity = mock( + (_workspaceId: string, _snapshot: typeof snapshot) => undefined + ); + + ( + workspaceService as unknown as { + extensionMetadata: ExtensionMetadataService; + emitWorkspaceActivity: typeof emitWorkspaceActivity; + } + ).extensionMetadata = { setStreaming } as unknown as ExtensionMetadataService; + ( + workspaceService as unknown as { + extensionMetadata: ExtensionMetadataService; + emitWorkspaceActivity: typeof emitWorkspaceActivity; + } + ).emitWorkspaceActivity = emitWorkspaceActivity; + + const internals = workspaceService as unknown as { + updateStreamingStatus: ( + workspaceId: string, + streaming: boolean, + options?: ExtensionMetadataStreamingUpdate + ) => Promise; + }; + + await internals.updateStreamingStatus(workspaceId, false); + + // The setStreaming call must omit `todoStatus` entirely. If it included + // `todoStatus: null`, ExtensionMetadataService.setStreaming would delete + // the slot (see the `update.todoStatus !== undefined` branch there). + expect(setStreaming).toHaveBeenCalledTimes(1); + expect(setStreaming).toHaveBeenCalledWith(workspaceId, false, { hasTodos: false }); + // Defensive double-check that the assertion is strict β€” toHaveBeenCalledWith + // with an object literal in some matchers tolerates extra fields. Use + // `not` against an explicit `todoStatus: null` payload to lock the + // contract. + expect(setStreaming).not.toHaveBeenCalledWith(workspaceId, false, { + hasTodos: false, todoStatus: null, }); }); @@ -3546,9 +3604,10 @@ describe("WorkspaceService metadata listeners", () => { await new Promise((resolve) => setTimeout(resolve, 0)); expect(setStreaming).toHaveBeenCalledTimes(1); + // todoStatus is intentionally NOT passed when there are no todos β€” + // see updateStreamingStatus comment for rationale. expect(setStreaming).toHaveBeenCalledWith(workspaceId, false, { hasTodos: false, - todoStatus: null, generation: 0, }); }); diff --git a/src/node/services/workspaceService.ts b/src/node/services/workspaceService.ts index 0bb166a244..46bc0b5da2 100644 --- a/src/node/services/workspaceService.ts +++ b/src/node/services/workspaceService.ts @@ -1543,7 +1543,12 @@ export class WorkspaceService extends EventEmitter { }); } - private emitWorkspaceActivity( + /** + * Public so AgentStatusService can broadcast a snapshot it produced after + * a direct setX call. (Most callers use emitWorkspaceActivityUpdate, which + * couples persist + emit but swallows persist errors.) + */ + public emitWorkspaceActivity( workspaceId: string, snapshot: WorkspaceActivitySnapshot | null ): void { @@ -1614,7 +1619,13 @@ export class WorkspaceService extends EventEmitter { const sessionDir = this.config.getSessionDir(workspaceId); const todos = await readTodosForSessionDir(sessionDir); hasTodos ??= todos.length > 0; - todoStatus ??= deriveTodoStatus(todos) ?? null; + // When there are no todos to derive from, leave `todoStatus` undefined + // so setStreaming doesn't touch the slot. AgentStatusService writes + // its AI-generated summary into the same `todoStatus` field β€” passing + // `null` here would clobber a freshly generated summary every time a + // free-form (no-todo) turn ends. Explicit clears still happen via + // setTodoStatus(null) when the agent calls `todo_write([])`. + todoStatus ??= deriveTodoStatus(todos); } if ( !streaming && @@ -3847,7 +3858,13 @@ export class WorkspaceService extends EventEmitter { } } - private async getWorkspaceTitleModelCandidates(workspaceId: string): Promise { + /** + * Candidate list for "small model" callers (title + AI sidebar status). + * Global preferences first, then any workspace-configured model so a + * custom-model workspace still works when global preferences are + * unavailable. Public so AgentStatusService can share the precedence. + */ + public async getWorkspaceTitleModelCandidates(workspaceId: string): Promise { const candidates: string[] = [...NAME_GEN_PREFERRED_MODELS]; const metadataResult = await this.aiService.getWorkspaceMetadata(workspaceId); if (!metadataResult.success) { diff --git a/src/node/services/workspaceStatusGenerator.test.ts b/src/node/services/workspaceStatusGenerator.test.ts new file mode 100644 index 0000000000..f751ddc7e9 --- /dev/null +++ b/src/node/services/workspaceStatusGenerator.test.ts @@ -0,0 +1,51 @@ +import { describe, expect, test } from "bun:test"; +import { buildWorkspaceStatusPrompt, generateWorkspaceStatus } from "./workspaceStatusGenerator"; + +describe("buildWorkspaceStatusPrompt", () => { + test("contains the transcript inside delimited markers", () => { + const prompt = buildWorkspaceStatusPrompt("User: please run tests\nAssistant: running"); + + // The transcript block needs explicit delimiters so the model can tell + // where the transcript ends and the requirements begin. If we ever drop + // these delimiters, the model is more likely to follow trailing + // instructions baked into the transcript itself (a real prompt-injection + // risk for arbitrary chat history). + expect(prompt).toContain(""); + expect(prompt).toContain(""); + expect(prompt).toContain("User: please run tests"); + expect(prompt).toContain("Assistant: running"); + }); + + test("falls back to a sentinel when transcript is empty", () => { + const prompt = buildWorkspaceStatusPrompt(""); + + // Empty transcripts must still produce a syntactically-valid prompt; the + // sentinel keeps the small model from inheriting system-prompt context + // from a previous workspace. + expect(prompt).toContain("(no recent transcript)"); + }); +}); + +describe("generateWorkspaceStatus error paths", () => { + test("returns a configuration error when no candidates are provided", async () => { + const fakeAiService = { + // Asserting this never gets called is the real point of this test β€” + // the empty-candidates short-circuit prevents wasteful provider calls + // for misconfigured workspaces. + createModel: () => { + throw new Error("createModel must not be called when no candidates exist"); + }, + } as unknown as Parameters[2]; + + const result = await generateWorkspaceStatus("hello", [], fakeAiService); + expect(result.success).toBe(false); + if (!result.success) { + expect(result.error.error.type).toBe("unknown"); + expect(result.error.error.raw).toContain("No model candidates"); + // No candidates means we never even attempted createModel, so the + // failure has nothing to do with the transcript β€” caller must keep + // retrying so a future config change recovers without a new message. + expect(result.error.reachedProvider).toBe(false); + } + }); +}); diff --git a/src/node/services/workspaceStatusGenerator.ts b/src/node/services/workspaceStatusGenerator.ts new file mode 100644 index 0000000000..bee1a054be --- /dev/null +++ b/src/node/services/workspaceStatusGenerator.ts @@ -0,0 +1,177 @@ +import { streamText, tool } from "ai"; +import type { AIService } from "./aiService"; +import { log } from "./log"; +import { runLanguageModelCleanup } from "./languageModelCleanup"; +import { mapModelCreationError, mapNameGenerationError } from "./workspaceTitleGenerator"; +import type { Result } from "@/common/types/result"; +import { Ok, Err } from "@/common/types/result"; +import type { NameGenerationError } from "@/common/types/errors"; +import { + TOOL_DEFINITIONS, + ProposeStatusToolArgsSchema, +} from "@/common/utils/tools/toolDefinitions"; + +/** + * AI-generated sidebar status: emoji + short verb-led phrase, matching + * WorkspaceAgentStatus so the frontend renders it through the same + * WorkspaceStatusIndicator path as displayStatus / todoStatus. + */ +export interface WorkspaceAgentStatusPayload { + emoji: string; + message: string; +} + +export interface GenerateWorkspaceStatusResult { + status: WorkspaceAgentStatusPayload; + /** The model that successfully generated the status */ + modelUsed: string; +} + +export interface GenerateWorkspaceStatusFailure { + error: NameGenerationError; + /** + * True if at least one candidate's `createModel` call succeeded, meaning + * we actually reached the provider with a request. False if every + * candidate failed during model construction (auth not connected, API + * key missing, provider disabled, model not available, policy denied, + * etc.). + * + * The caller uses this to decide whether to advance its dedup hash: + * post-provider failures (model refused tool, rate limit, network blip, + * persistent provider error) are properties of the *transcript* and + * should defer until the chat changes. Pre-provider failures are + * properties of the user's *config* and must remain retriable so a + * later credential/provider fix recovers without requiring a transcript + * change first. + */ + reachedProvider: boolean; +} + +/** + * Build the prompt used by {@link generateWorkspaceStatus}. The transcript + * is supplied pre-trimmed (token budget enforced upstream). The prompt + * intentionally targets "current activity" not "overall task scope" β€” this + * is a sidebar status, not a workspace title. + */ +export function buildWorkspaceStatusPrompt(transcript: string): string { + // Sentinel for an empty window. AgentStatusService skips empty inputs in + // practice, but the model still needs something to ground on. + const body = transcript.trim().length > 0 ? transcript : "(no recent transcript)"; + return [ + "You produce a short sidebar status summarizing the most recent activity in an AI coding agent's chat.\n\n", + "Recent chat transcript (oldest first, newest last):\n", + "\n", + body, + "\n\n\n", + "Requirements:\n", + "- Describe the specific activity the agent was last working on, drawn from the actual transcript content.\n", + "- Always name a concrete activity (file, feature, bug, command, etc.) from the transcript. Generic non-informative phrasing is rejected and not shown.\n", + "- Tense: use present tense if the agent appears to still be in the middle of the activity; use past tense if the most recent assistant turn looks complete (e.g. wrapped up with a summary, no pending tool calls).\n", + // The sidebar renders the emoji through EmojiIcon, which maps a fixed + // set of glyphs to Lucide icons. Emojis outside this set fall back to + // a generic Sparkles icon, which looks identical regardless of the + // activity. Restrict the model to glyphs we know render correctly. + "- emoji: must be exactly one of: πŸ” πŸ“ βœ… ❌ πŸš€ ⏳ πŸ”— πŸ”„ πŸ§ͺ πŸ€” πŸ”§ πŸ›  πŸ”” 🌐 πŸ“– πŸ“¦ πŸ’€ πŸ’‘ ⚠. Pick the one that best matches the activity (πŸ” investigating, πŸ“ writing, βœ… done/completed, ❌ failed, πŸš€ deploying/launching, ⏳ waiting, πŸ”„ refreshing/iterating, πŸ§ͺ testing, πŸ€” deciding, πŸ”§ πŸ›  fixing/building, 🌐 network/web, πŸ“– reading docs, πŸ“¦ packaging, πŸ’€ idle, πŸ’‘ planning, ⚠ warning).\n", + "- message: 2-6 words, verb-led, sentence case, no punctuation, no quotes.\n", + '- Examples (in progress): "Investigating crash", "Implementing sidebar status", "Running tests", "Reading config files".\n', + '- Examples (completed): "Wrote tests", "Fixed sidebar bug", "Investigated crash", "Refactored config loader".\n\n', + "Call propose_status exactly once with your chosen emoji and message. Do not emit any text response.", + ].join(""); +} + +/** + * Generate a sidebar agent-status summary using the same "small model" path + * that powers workspace title generation. Tries up to 3 candidates so a + * single misconfigured candidate can't permanently disable status updates. + */ +export async function generateWorkspaceStatus( + transcript: string, + candidates: readonly string[], + aiService: AIService +): Promise> { + if (candidates.length === 0) { + return Err({ + error: { + type: "unknown", + raw: "No model candidates provided for workspace status generation", + }, + reachedProvider: false, + }); + } + + const maxAttempts = Math.min(candidates.length, 3); + let lastError: NameGenerationError | null = null; + // Track whether any candidate's createModel call succeeded β€” i.e., whether + // we actually crossed the wire to a provider. If every attempt fails at + // construction (no API key, OAuth not connected, provider disabled, etc.), + // the failure is about the user's config rather than the transcript and + // the caller must keep retrying so a later fix recovers. + let reachedProvider = false; + + for (let i = 0; i < maxAttempts; i++) { + const modelString = candidates[i]; + + const modelResult = await aiService.createModel(modelString, undefined, { + agentInitiated: true, + }); + if (!modelResult.success) { + lastError = mapModelCreationError(modelResult.error, modelString); + log.debug(`Status generation: skipping ${modelString} (${modelResult.error.type})`); + continue; + } + reachedProvider = true; + + try { + const currentStream = streamText({ + model: modelResult.data, + prompt: buildWorkspaceStatusPrompt(transcript), + tools: { + propose_status: tool({ + description: TOOL_DEFINITIONS.propose_status.description, + inputSchema: ProposeStatusToolArgsSchema, + // eslint-disable-next-line @typescript-eslint/require-await -- AI SDK Tool.execute must return a Promise + execute: async (args) => ({ success: true as const, ...args }), + }), + }, + }); + + const results = await currentStream.toolResults; + const toolResult = results.find((r) => r.dynamic !== true && r.toolName === "propose_status"); + + if (!toolResult) { + lastError = { type: "unknown", raw: "Model did not call propose_status tool" }; + log.warn("Status generation: model did not call propose_status", { modelString }); + continue; + } + + const { emoji, message } = toolResult.output; + return Ok({ + status: { emoji: emoji.trim(), message: message.trim() }, + modelUsed: modelString, + }); + } catch (error) { + lastError = mapNameGenerationError(error, modelString); + log.warn("Status generation failed, trying next candidate", { + modelString, + error: lastError, + }); + continue; + } finally { + // Mirror workspaceTitleGenerator: some providers attach cleanup hooks + // to the created model (notably the OpenAI Responses WebSocket + // transport, which attaches webSocketTransport.close). Without this + // call the periodic AgentStatusService loop would leak transports + // for every successful or failed candidate, every tick, every + // workspace. + runLanguageModelCleanup(modelResult.data); + } + } + + return Err({ + error: lastError ?? { + type: "configuration", + raw: "No working model candidates were available for workspace status generation.", + }, + reachedProvider, + }); +} diff --git a/src/node/utils/extensionMetadata.ts b/src/node/utils/extensionMetadata.ts index 551f77d181..aebc8c63bf 100644 --- a/src/node/utils/extensionMetadata.ts +++ b/src/node/utils/extensionMetadata.ts @@ -122,7 +122,6 @@ export function toWorkspaceActivitySnapshot( // agentStatus field. Project that forward into todoStatus until a fresh todo_write // or stream-stop snapshot rewrites the workspace metadata. coerceAgentStatus(metadata.agentStatus); - return { recency: metadata.recency, streaming: metadata.streaming,