diff --git a/bun.lock b/bun.lock index bdf0674..705f903 100644 --- a/bun.lock +++ b/bun.lock @@ -6,7 +6,6 @@ "name": "phantom", "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.77", - "@anthropic-ai/sdk": "^0.80.0", "@modelcontextprotocol/sdk": "^1.28.0", "@slack/bolt": "^4.6.0", "croner": "^10.0.1", @@ -16,7 +15,6 @@ "telegraf": "^4.16.3", "yaml": "^2.6.0", "zod": "^3.24.0", - "zod-to-json-schema": "^3.25.1", }, "devDependencies": { "@biomejs/biome": "^1.9.0", @@ -29,10 +27,6 @@ "packages": { "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.84", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-rvp3kZJM4IgDBE1zwj30H3N0bI3pYRF28tDJoyAVuWTLiWls7diNVCyFz7GeXZEAYYD87lCBE3vnQplLLluNHg=="], - "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.80.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-WeXLn7zNVk3yjeshn+xZHvld6AoFUOR3Sep6pSoHho5YbSi6HwcirqgPA5ccFuW8QTVJAAU7N8uQQC6Wa9TG+g=="], - - "@babel/runtime": ["@babel/runtime@7.29.2", "", {}, "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="], - "@biomejs/biome": ["@biomejs/biome@1.9.4", "", { "optionalDependencies": { "@biomejs/cli-darwin-arm64": "1.9.4", "@biomejs/cli-darwin-x64": "1.9.4", "@biomejs/cli-linux-arm64": "1.9.4", "@biomejs/cli-linux-arm64-musl": "1.9.4", "@biomejs/cli-linux-x64": "1.9.4", "@biomejs/cli-linux-x64-musl": "1.9.4", "@biomejs/cli-win32-arm64": "1.9.4", "@biomejs/cli-win32-x64": "1.9.4" }, "bin": { "biome": "bin/biome" } }, 
"sha512-1rkd7G70+o9KkTn5KLmDYXihGoTaIGO9PIIN2ZB7UJxFrWw04CZHPYiMRjYsaDvVV7hP1dYNRLxSANLaBFGpog=="], "@biomejs/cli-darwin-arm64": ["@biomejs/cli-darwin-arm64@1.9.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-bFBsPWrNvkdKrNCYeAp+xo2HecOGPAy9WyNyB/jKnnedgzl4W4Hb9ZMzYNbf8dMCGmUdSavlYHiR01QaYR58cw=="], @@ -281,8 +275,6 @@ "jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="], - "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="], - "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="], "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="], @@ -443,8 +435,6 @@ "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="], - "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="], - "tsscmp": ["tsscmp@1.0.6", "", {}, "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA=="], "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="], diff --git a/package.json b/package.json index e5dbcd4..b80234c 100644 --- a/package.json +++ b/package.json @@ -17,7 +17,6 @@ }, "dependencies": { "@anthropic-ai/claude-agent-sdk": "^0.2.77", - "@anthropic-ai/sdk": "^0.80.0", "@modelcontextprotocol/sdk": "^1.28.0", "@slack/bolt": "^4.6.0", "croner": "^10.0.1", @@ -26,8 
+25,7 @@ "resend": "^6.9.4", "telegraf": "^4.16.3", "yaml": "^2.6.0", - "zod": "^3.24.0", - "zod-to-json-schema": "^3.25.1" + "zod": "^3.24.0" }, "devDependencies": { "@biomejs/biome": "^1.9.0", diff --git a/src/agent/__tests__/judge-query.test.ts b/src/agent/__tests__/judge-query.test.ts new file mode 100644 index 0000000..065bf45 --- /dev/null +++ b/src/agent/__tests__/judge-query.test.ts @@ -0,0 +1,95 @@ +import { describe, expect, test } from "bun:test"; +import { z } from "zod/v4"; +import { parseJsonFromResponse } from "../judge-query.ts"; + +// parseJsonFromResponse is the shape-normalization layer for judge subprocess output. +// Models sometimes return markdown fences, leading prose, or trailing whitespace even +// when asked for raw JSON. These tests lock in the tolerance window: we accept the +// well-formed common cases and reject anything that cannot be safely parsed. + +const Schema = z.object({ + verdict: z.enum(["pass", "fail"]), + confidence: z.number().min(0).max(1), + reasoning: z.string(), +}); + +describe("parseJsonFromResponse", () => { + test("parses raw JSON object", () => { + const text = '{"verdict":"pass","confidence":0.95,"reasoning":"Looks clean."}'; + const result = parseJsonFromResponse(text, Schema); + expect(result.verdict).toBe("pass"); + expect(result.confidence).toBe(0.95); + }); + + test("parses JSON wrapped in markdown json code fence", () => { + const text = '```json\n{"verdict":"fail","confidence":0.8,"reasoning":"Issue detected."}\n```'; + const result = parseJsonFromResponse(text, Schema); + expect(result.verdict).toBe("fail"); + expect(result.reasoning).toBe("Issue detected."); + }); + + test("parses JSON wrapped in plain markdown code fence", () => { + const text = '```\n{"verdict":"pass","confidence":1,"reasoning":"ok"}\n```'; + const result = parseJsonFromResponse(text, Schema); + expect(result.verdict).toBe("pass"); + }); + + test("handles leading/trailing whitespace", () => { + const text = '\n\n 
{"verdict":"pass","confidence":0.5,"reasoning":"fine"} \n'; + const result = parseJsonFromResponse(text, Schema); + expect(result.verdict).toBe("pass"); + }); + + test("recovers JSON from surrounding prose via brace scan", () => { + const text = 'Here is my analysis: {"verdict":"fail","confidence":0.72,"reasoning":"Unsafe pattern"}. Thank you.'; + const result = parseJsonFromResponse(text, Schema); + expect(result.verdict).toBe("fail"); + expect(result.confidence).toBe(0.72); + }); + + test("throws a clear error on empty response", () => { + expect(() => parseJsonFromResponse("", Schema)).toThrow(/empty/i); + expect(() => parseJsonFromResponse(" \n\n ", Schema)).toThrow(/empty/i); + }); + + test("throws on text with no JSON object at all", () => { + expect(() => parseJsonFromResponse("I cannot comply with this request.", Schema)).toThrow(/non-JSON|invalid/i); + }); + + test("throws on malformed JSON", () => { + const text = '{"verdict":"pass", "confidence":'; + expect(() => parseJsonFromResponse(text, Schema)).toThrow(/invalid JSON|non-JSON/i); + }); + + test("throws on JSON that violates the schema", () => { + const text = '{"verdict":"maybe","confidence":0.9,"reasoning":"..."}'; + expect(() => parseJsonFromResponse(text, Schema)).toThrow(/schema validation/i); + }); + + test("throws on JSON missing required fields", () => { + const text = '{"verdict":"pass"}'; + expect(() => parseJsonFromResponse(text, Schema)).toThrow(/schema validation/i); + }); + + test("throws on confidence out of range", () => { + const text = '{"verdict":"pass","confidence":1.5,"reasoning":"over"}'; + expect(() => parseJsonFromResponse(text, Schema)).toThrow(/schema validation/i); + }); + + test("error message includes truncated response for debugging", () => { + const text = "not json at all, just prose with no object"; + expect(() => parseJsonFromResponse(text, Schema)).toThrow(/not json/i); + }); + + test("parses nested structures", () => { + const Nested = z.object({ + flags: 
z.array(z.object({ category: z.string(), severity: z.enum(["critical", "warning", "info"]) })), + verdict: z.enum(["pass", "fail"]), + }); + const text = '```json\n{"flags":[{"category":"safety","severity":"critical"}],"verdict":"fail"}\n```'; + const result = parseJsonFromResponse(text, Nested); + expect(result.flags).toHaveLength(1); + expect(result.flags[0].severity).toBe("critical"); + expect(result.verdict).toBe("fail"); + }); +}); diff --git a/src/agent/judge-query.ts b/src/agent/judge-query.ts new file mode 100644 index 0000000..0901760 --- /dev/null +++ b/src/agent/judge-query.ts @@ -0,0 +1,206 @@ +import { query } from "@anthropic-ai/claude-agent-sdk"; +import { z } from "zod/v4"; +import type { PhantomConfig } from "../config/types.ts"; +import { extractTextFromMessage } from "./message-utils.ts"; + +// Judge subprocess integration. Routes LLM judge calls through the same +// Agent SDK `query()` subprocess as the main agent so that auth, provider, +// and base URL flow through a single path. The older raw Anthropic SDK +// integration (`client.messages.parse`) is gone; structured output is now +// produced by prompt instruction + JSON.parse + Zod validation. + +export type JudgeQueryOptions = { + systemPrompt: string; + userMessage: string; + schema: z.ZodType; + model?: string; + maxTokens?: number; +}; + +export type JudgeQueryResult = { + verdict: "pass" | "fail"; + confidence: number; + reasoning: string; + data: T; + model: string; + inputTokens: number; + outputTokens: number; + costUsd: number; + durationMs: number; +}; + +// Minimum permissive schema shape so we can surface verdict/confidence/reasoning +// on the envelope when the concrete schema opts into those fields. 
/**
 * Optional fields that `runJudgeQuery` lifts from the parsed judge output onto
 * the result envelope. Every field is optional because a concrete judge schema
 * may or may not declare it.
 */
type JudgeEnvelopeFields = {
  verdict?: "pass" | "fail";
  confidence?: number;
  reasoning?: string;
  overall_reasoning?: string;
};

/** Token counters read from the `usage` object of the final `result` message. */
type JudgeUsage = {
  input_tokens?: number;
  output_tokens?: number;
  cache_read_input_tokens?: number;
  cache_creation_input_tokens?: number;
};

// Opening fence: three backticks, an optional `json` tag, optional whitespace/newline.
const JSON_BLOCK = /^```(?:json)?\s*\n?/;
// Closing fence: optional newline, three backticks, optional trailing whitespace.
const TRAILING_BLOCK = /\n?```\s*$/;

/**
 * Parse and validate a JSON response returned by a judge subprocess.
 *
 * Handles three common model output shapes:
 *   1. Raw JSON (preferred, matches the prompt instruction)
 *   2. JSON wrapped in a ```json ... ``` (or plain ```) code fence
 *   3. Prose around a JSON object, recovered by taking the substring from the
 *      first `{` to the last `}`
 *
 * Anything else is rejected with an error; there is no fallback to partial data.
 *
 * @param text   Raw text produced by the judge.
 * @param schema Schema whose `safeParse` validates the parsed value.
 * @returns The value returned by `schema.safeParse` on success.
 * @throws Error "Judge returned empty response" when `text` is empty or whitespace.
 * @throws Error "Judge returned non-JSON response: …" when no `{…}` span exists.
 * @throws Error "Judge returned invalid JSON: …" when the `{…}` span does not parse.
 * @throws Error "Judge output failed schema validation: …" when validation fails.
 */
export function parseJsonFromResponse<T>(text: string, schema: z.ZodType<T>): T {
  if (!text || text.trim().length === 0) {
    throw new Error("Judge returned empty response");
  }

  let cleaned = text.trim();
  if (cleaned.startsWith("```")) {
    cleaned = cleaned.replace(JSON_BLOCK, "").replace(TRAILING_BLOCK, "").trim();
  }

  let raw: unknown;
  try {
    raw = JSON.parse(cleaned);
  } catch {
    // Second chance: find the outermost JSON object in the text.
    // Useful when a model prepends/appends commentary despite the prompt.
    const firstBrace = cleaned.indexOf("{");
    const lastBrace = cleaned.lastIndexOf("}");
    if (firstBrace === -1 || lastBrace === -1 || lastBrace <= firstBrace) {
      // Error messages quote the ORIGINAL text (not `cleaned`), capped at 200 chars.
      throw new Error(`Judge returned non-JSON response: ${truncate(text, 200)}`);
    }
    try {
      raw = JSON.parse(cleaned.slice(firstBrace, lastBrace + 1));
    } catch (err) {
      const msg = err instanceof Error ? err.message : String(err);
      throw new Error(`Judge returned invalid JSON: ${msg}. Response: ${truncate(text, 200)}`);
    }
  }

  const result = schema.safeParse(raw);
  if (!result.success) {
    throw new Error(`Judge output failed schema validation: ${formatZodError(result.error)}`);
  }
  return result.data;
}

/**
 * Total input tokens for a judge call: uncached input plus cache-read and
 * cache-creation tokens. Summing all three mirrors how `extractCost` in
 * message-utils.ts computes per-model input tokens, so judge and main-agent
 * token counts are comparable. Missing counters count as 0.
 */
function totalInputTokens(usage: JudgeUsage | undefined): number {
  return (
    (usage?.input_tokens ?? 0) +
    (usage?.cache_read_input_tokens ?? 0) +
    (usage?.cache_creation_input_tokens ?? 0)
  );
}

/**
 * Run a focused evaluation query through the Agent SDK `query()` call and
 * return the schema-validated result plus usage metadata.
 *
 * The system prompt is the `claude_code` preset with the caller's system prompt
 * and a JSON-schema contract appended (see `buildJudgePrompt`). The call passes
 * `maxTurns: 1`, `effort: "low"` and `persistSession: false`; no MCP servers or
 * hooks are passed from here.
 *
 * The model is `options.model`, else `config.judge_model`, else `config.model`.
 * `options.maxTokens` is accepted for interface compatibility but is not
 * forwarded to `query()` by this function.
 *
 * NOTE(review): the query runs with `permissionMode: "bypassPermissions"` and no
 * tool restriction is passed here. Judges evaluate session content, so confirm
 * whether tool access should be explicitly disabled for this call.
 *
 * @param config  Phantom config; only `judge_model` and `model` are read.
 * @param options System prompt, user message, schema, and optional model override.
 * @returns Envelope with `verdict` (defaults to "pass" when the parsed output has
 *          none), `confidence` (defaults to 1.0 when not a number), `reasoning`
 *          (`reasoning`, else `overall_reasoning`, else ""), the parsed `data`,
 *          and model / token / cost / duration metadata.
 * @throws Error "Judge subprocess ended with <subtype>" when the final result
 *         message has a subtype other than "success".
 * @throws Error from `parseJsonFromResponse` when the response text is empty,
 *         not JSON, or fails schema validation.
 */
export async function runJudgeQuery<T>(
  config: PhantomConfig,
  options: JudgeQueryOptions<T>,
): Promise<JudgeQueryResult<T>> {
  const startTime = Date.now();
  const resolvedModel = options.model ?? config.judge_model ?? config.model;

  const schemaJson = z.toJSONSchema(options.schema);
  const judgePrompt = buildJudgePrompt(options.systemPrompt, schemaJson);

  const queryStream = query({
    prompt: options.userMessage,
    options: {
      model: resolvedModel,
      permissionMode: "bypassPermissions",
      allowDangerouslySkipPermissions: true,
      systemPrompt: {
        type: "preset" as const,
        preset: "claude_code" as const,
        append: judgePrompt,
      },
      maxTurns: 1,
      effort: "low",
      persistSession: false,
    },
  });

  let responseText = "";
  let inputTokens = 0;
  let outputTokens = 0;
  let resultCostUsd = 0;
  let errored: string | null = null;

  for await (const message of queryStream) {
    switch (message.type) {
      case "assistant": {
        // Keep the most recent non-empty assistant text; a successful result
        // message below overrides it.
        const content = extractTextFromMessage(message.message);
        if (content) responseText = content;
        break;
      }
      case "result": {
        const msg = message as {
          subtype: string;
          result?: string;
          total_cost_usd?: number;
          usage?: JudgeUsage;
        };
        if (msg.subtype === "success" && msg.result) {
          responseText = msg.result;
        }
        if (msg.subtype !== "success") {
          errored = msg.subtype;
        }
        // Count cached input too, so the figure matches extractCost's accounting.
        inputTokens = totalInputTokens(msg.usage);
        outputTokens = msg.usage?.output_tokens ?? 0;
        resultCostUsd = msg.total_cost_usd ?? 0;
        break;
      }
    }
  }

  if (errored) {
    throw new Error(`Judge subprocess ended with ${errored}`);
  }

  const parsed = parseJsonFromResponse(responseText, options.schema);
  // View the parsed data through the optional envelope fields; schemas that do
  // not declare them fall through to the defaults below.
  const envelope = parsed as T & JudgeEnvelopeFields;

  return {
    verdict: envelope.verdict ?? "pass",
    confidence: typeof envelope.confidence === "number" ? envelope.confidence : 1.0,
    reasoning: envelope.reasoning ?? envelope.overall_reasoning ?? "",
    data: parsed,
    model: resolvedModel,
    inputTokens,
    outputTokens,
    costUsd: resultCostUsd,
    durationMs: Date.now() - startTime,
  };
}

/**
 * Build the text appended to the preset system prompt: the caller's prompt,
 * the raw-JSON-only instructions, and the pretty-printed JSON schema.
 */
function buildJudgePrompt(systemPrompt: string, schemaJson: unknown): string {
  return [
    systemPrompt,
    "",
    "You MUST respond with ONLY a JSON object that conforms to the schema below.",
    "Do not include markdown code fences, prose, explanations, or any text outside the JSON object.",
    "The first character of your response must be `{` and the last must be `}`.",
    "",
    "Schema:",
    JSON.stringify(schemaJson, null, 2),
  ].join("\n");
}

/**
 * Render at most the first three Zod issues as `path: message`, joined by "; ".
 * An empty path is shown as "(root)"; further issues are summarised as
 * " (+N more)".
 */
function formatZodError(error: z.ZodError): string {
  const issues = error.issues.slice(0, 3).map((issue) => {
    const path = issue.path.length > 0 ? issue.path.join(".") : "(root)";
    return `${path}: ${issue.message}`;
  });
  const suffix = error.issues.length > 3 ? ` (+${error.issues.length - 3} more)` : "";
  return `${issues.join("; ")}${suffix}`;
}

/** Return `text` unchanged when it fits in `max` characters, else its first `max` characters plus "...". */
function truncate(text: string, max: number): string {
  if (text.length <= max) return text;
  return `${text.slice(0, max)}...`;
}

// ---------------------------------------------------------------------------
// The original line continued into the next file of this patch. That text is
// preserved below so the patch context is not lost:
//
// diff --git a/src/agent/message-utils.ts b/src/agent/message-utils.ts
// new file mode 100644
// index 0000000..7f96a4f
// --- /dev/null
// +++ b/src/agent/message-utils.ts
// @@ -0,0 +1,56 @@
// +import type { AgentCost } from "./events.ts";
// +
// +// Shared Agent SDK message parsing used by both the main query and the judge query.
+// These helpers were previously private to runtime.ts. Lifting them out keeps +// runtime.ts below the 300-line ceiling and lets judgeQuery() reuse them without +// duplication. + +export function extractTextFromMessage(message: { + content: ReadonlyArray<{ type: string; text?: string }>; +}): string { + return message.content + .filter((block) => block.type === "text" && block.text) + .map((block) => block.text ?? "") + .join("\n"); +} + +export function extractCost(message: { + total_cost_usd: number; + usage: Record; + modelUsage: Record< + string, + { + inputTokens: number; + outputTokens: number; + cacheReadInputTokens?: number; + cacheCreationInputTokens?: number; + costUSD: number; + } + >; +}): AgentCost { + const modelUsage: AgentCost["modelUsage"] = {}; + + for (const [model, usage] of Object.entries(message.modelUsage)) { + const totalModelInput = + usage.inputTokens + (usage.cacheReadInputTokens ?? 0) + (usage.cacheCreationInputTokens ?? 0); + modelUsage[model] = { + inputTokens: totalModelInput, + outputTokens: usage.outputTokens, + costUsd: usage.costUSD, + }; + } + + let totalInput = 0; + let totalOutput = 0; + for (const usage of Object.values(modelUsage)) { + totalInput += usage.inputTokens; + totalOutput += usage.outputTokens; + } + + return { + totalUsd: message.total_cost_usd, + inputTokens: totalInput, + outputTokens: totalOutput, + modelUsage, + }; +} diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts index 4ed3fc8..4c2eddb 100644 --- a/src/agent/runtime.ts +++ b/src/agent/runtime.ts @@ -8,6 +8,8 @@ import type { RoleTemplate } from "../roles/types.ts"; import { CostTracker } from "./cost-tracker.ts"; import { type AgentCost, type AgentResponse, emptyCost } from "./events.ts"; import { createDangerousCommandBlocker, createFileTracker } from "./hooks.ts"; +import { type JudgeQueryOptions, type JudgeQueryResult, runJudgeQuery } from "./judge-query.ts"; +import { extractCost, extractTextFromMessage } from "./message-utils.ts"; import { 
assemblePrompt } from "./prompt-assembler.ts"; import { SessionStore } from "./session-store.ts"; @@ -103,6 +105,18 @@ export class AgentRuntime { return this.activeSessions.size; } + /** + * Run a focused evaluation query through the same subprocess as the main agent. + * + * Evolution judges route through this method so that auth, provider, and base URL + * flow through a single code path. No MCP servers, no hooks, no session persistence: + * judges are stateless evaluators that receive a system prompt, a user message, and + * a Zod schema describing the expected JSON response. + */ + async judgeQuery(options: JudgeQueryOptions): Promise> { + return runJudgeQuery(this.config, options); + } + private async runQuery( sessionKey: string, channelId: string, @@ -259,53 +273,3 @@ export class AgentRuntime { }; } } - -function extractTextFromMessage(message: { - content: ReadonlyArray<{ type: string; text?: string }>; -}): string { - return message.content - .filter((block) => block.type === "text" && block.text) - .map((block) => block.text ?? "") - .join("\n"); -} - -function extractCost(message: { - total_cost_usd: number; - usage: Record; - modelUsage: Record< - string, - { - inputTokens: number; - outputTokens: number; - cacheReadInputTokens?: number; - cacheCreationInputTokens?: number; - costUSD: number; - } - >; -}): AgentCost { - const modelUsage: AgentCost["modelUsage"] = {}; - - for (const [model, usage] of Object.entries(message.modelUsage)) { - const totalModelInput = - usage.inputTokens + (usage.cacheReadInputTokens ?? 0) + (usage.cacheCreationInputTokens ?? 
0); - modelUsage[model] = { - inputTokens: totalModelInput, - outputTokens: usage.outputTokens, - costUsd: usage.costUSD, - }; - } - - let totalInput = 0; - let totalOutput = 0; - for (const usage of Object.values(modelUsage)) { - totalInput += usage.inputTokens; - totalOutput += usage.outputTokens; - } - - return { - totalUsd: message.total_cost_usd, - inputTokens: totalInput, - outputTokens: totalOutput, - modelUsage, - }; -} diff --git a/src/config/schemas.ts b/src/config/schemas.ts index dbce22c..9d4e52d 100644 --- a/src/config/schemas.ts +++ b/src/config/schemas.ts @@ -14,6 +14,10 @@ export const PhantomConfigSchema = z.object({ port: z.number().int().min(1).max(65535).default(3100), role: z.string().min(1).default("swe"), model: z.string().min(1).default("claude-sonnet-4-6"), + // Optional override for the model used by evolution judges. Defaults to `model` when omitted + // so a single-model deployment "just works". Lets operators run a cheaper model for judging + // while keeping a more capable model for the primary agent. 
+ judge_model: z.string().min(1).optional(), effort: z.enum(["low", "medium", "high", "max"]).default("max"), max_budget_usd: z.number().min(0).default(0), timeout_minutes: z.number().min(1).default(240), diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts index c32b4df..a2ed021 100644 --- a/src/evolution/engine.ts +++ b/src/evolution/engine.ts @@ -1,5 +1,6 @@ import { readFileSync, writeFileSync } from "node:fs"; import { join } from "node:path"; +import type { AgentRuntime } from "../agent/runtime.ts"; import { applyApproved } from "./application.ts"; import { type EvolutionConfig, loadEvolutionConfig } from "./config.ts"; import { recordObservations, runConsolidation } from "./consolidation.ts"; @@ -32,18 +33,27 @@ export class EvolutionEngine { private llmJudgesEnabled: boolean; private dailyCostUsd = 0; private dailyCostResetDate = ""; + private runtime: AgentRuntime | null; - constructor(configPath?: string) { + // `runtime` is optional so existing tests and heuristic-only deployments can + // construct an engine without wiring a full AgentRuntime. When the engine + // is asked to use LLM judges but has no runtime, it falls back to heuristics. + constructor(configPath?: string, runtime?: AgentRuntime) { this.config = loadEvolutionConfig(configPath); this.checker = new ConstitutionChecker(this.config); + this.runtime = runtime ?? null; this.llmJudgesEnabled = this.resolveJudgeMode(); if (this.llmJudgesEnabled) { - console.log("[evolution] LLM judges enabled (API key detected)"); + console.log("[evolution] LLM judges enabled"); } else { - console.log("[evolution] LLM judges disabled (no API key or config override)"); + console.log("[evolution] LLM judges disabled (config override or no auth detected)"); } } + setRuntime(runtime: AgentRuntime): void { + this.runtime = runtime; + } + private resolveJudgeMode(): boolean { const setting = this.config.judges?.enabled ?? 
"auto"; if (setting === "never") return false; @@ -82,9 +92,9 @@ export class EvolutionEngine { // Step 1: Observation Extraction (LLM or heuristic) let observations: import("./types.ts").SessionObservation[]; - if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { + if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) { const currentConfig = this.getConfig(); - const result = await extractObservationsWithLLM(session, currentConfig); + const result = await extractObservationsWithLLM(this.runtime, session, currentConfig); observations = result.observations; if (result.judgeCost) { addCost(judgeCosts.observation_extraction, result.judgeCost); @@ -119,8 +129,15 @@ export class EvolutionEngine { const goldenSuite = loadSuite(this.config); let validationResults: import("./types.ts").ValidationResult[]; - if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { - const judgeResult = await validateAllWithJudges(deltas, this.checker, goldenSuite, this.config, currentConfig); + if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) { + const judgeResult = await validateAllWithJudges( + this.runtime, + deltas, + this.checker, + goldenSuite, + this.config, + currentConfig, + ); validationResults = judgeResult.results; mergeCosts(judgeCosts, judgeResult.judgeCosts); this.incrementDailyCost(totalCostFromJudgeCosts(judgeResult.judgeCosts)); @@ -161,9 +178,9 @@ export class EvolutionEngine { } // Quality Assessment (LLM only, non-blocking) - if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) { + if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) { try { - const qualityResult = await runQualityJudge(session, currentConfig); + const qualityResult = await runQualityJudge(this.runtime, session, currentConfig); judgeCosts.quality_assessment.calls++; judgeCosts.quality_assessment.totalUsd += qualityResult.costUsd; judgeCosts.quality_assessment.totalInputTokens += qualityResult.inputTokens; diff 
--git a/src/evolution/judges/client.ts b/src/evolution/judges/client.ts index 6254e99..23cc835 100644 --- a/src/evolution/judges/client.ts +++ b/src/evolution/judges/client.ts @@ -1,87 +1,63 @@ -import Anthropic from "@anthropic-ai/sdk"; -import { zodOutputFormat } from "@anthropic-ai/sdk/helpers/zod"; -// zod/v4 required: matches schemas.ts for zodOutputFormat compatibility +// zod/v4 required: matches schemas.ts so judge schemas flow through unchanged. import type { z } from "zod/v4"; -import { - JUDGE_MAX_TOKENS, - JUDGE_TEMPERATURE, - type JudgeResult, - type MultiJudgeResult, - type VotingStrategy, -} from "./types.ts"; +import type { AgentRuntime } from "../../agent/runtime.ts"; +import type { JudgeResult, MultiJudgeResult, VotingStrategy } from "./types.ts"; -let _client: Anthropic | null = null; - -function getClient(): Anthropic { - if (!_client) { - _client = new Anthropic(); - } - return _client; -} - -// Visible for testing - allows injecting a mock client -export function setClient(client: Anthropic | null): void { - _client = client; -} +// Judges used to live on the raw Anthropic SDK (`client.messages.parse`). They now +// route through the same Agent SDK subprocess as the main agent, so a single auth +// path and a single provider env var control both tiers. The shape of this module +// is deliberately small: it exists to delegate, not to own its own transport. +/** + * Back-compat signal: does the judge machinery have any hope of running? + * + * With the old raw-SDK design this checked `ANTHROPIC_API_KEY`. Under the + * subprocess design, authentication is handled by the Claude Code CLI itself + * (via `claude login`, custom base URLs, or env vars like `ANTHROPIC_BASE_URL`). + * There is no reliable way to introspect CLI auth status from this module, + * and a failed subprocess call will surface a clear error anyway. Returning + * `true` preserves any callers without reintroducing an auth coupling. 
+ */ export function isJudgeAvailable(): boolean { - return !!process.env.ANTHROPIC_API_KEY; + return true; } /** * Call a single LLM judge with structured output. - * Uses the raw Anthropic SDK (not the Agent SDK). - * Temperature 0 for deterministic judging. + * + * Returns a `JudgeResult` matching the pre-subprocess contract so every + * downstream judge (safety, constitution, observation, etc.) and the voting + * logic in `multiJudge()` continue to work without changes to their shape. */ -export async function callJudge(options: { - model: string; - systemPrompt: string; - userMessage: string; - schema: z.ZodType; - schemaName?: string; - maxTokens?: number; -}): Promise> { - const client = getClient(); - const startTime = Date.now(); - - const message = await client.messages.parse({ +export async function callJudge( + runtime: AgentRuntime, + options: { + model: string; + systemPrompt: string; + userMessage: string; + schema: z.ZodType; + schemaName?: string; + maxTokens?: number; + }, +): Promise> { + const result = await runtime.judgeQuery({ + systemPrompt: options.systemPrompt, + userMessage: options.userMessage, + schema: options.schema, model: options.model, - max_tokens: options.maxTokens ?? 
JUDGE_MAX_TOKENS, - temperature: JUDGE_TEMPERATURE, - system: options.systemPrompt, - messages: [{ role: "user", content: options.userMessage }], - output_config: { - // Cast needed: SDK .d.ts references zod v3 types but runtime uses zod/v4 - // biome-ignore lint/suspicious/noExplicitAny: bridging zod v3/v4 type mismatch - format: zodOutputFormat(options.schema as any), - }, + maxTokens: options.maxTokens, }); - const parsed = message.parsed_output; - if (!parsed) { - throw new Error(`Judge returned no structured output (stop_reason: ${message.stop_reason})`); - } - - const inputTokens = message.usage.input_tokens; - const outputTokens = message.usage.output_tokens; - const costUsd = estimateCost(options.model, inputTokens, outputTokens); - - // Extract verdict and confidence from the parsed data if present - const data = parsed as Record; - const verdict = (data.verdict as "pass" | "fail") ?? "pass"; - const confidence = (data.confidence as number) ?? 1.0; - const reasoning = (data.reasoning as string) ?? (data.overall_reasoning as string) ?? ""; - return { - verdict, - confidence, - reasoning, - data: parsed, - model: options.model, - inputTokens, - outputTokens, - costUsd, - durationMs: Date.now() - startTime, + verdict: result.verdict, + confidence: result.confidence, + reasoning: result.reasoning, + data: result.data, + model: result.model, + inputTokens: result.inputTokens, + outputTokens: result.outputTokens, + costUsd: result.costUsd, + durationMs: result.durationMs, }; } @@ -89,7 +65,7 @@ export async function callJudge(options: { * Run multiple judges in parallel and aggregate results. 
* * Strategies: - * - minority_veto: ANY fail with confidence > threshold = overall fail + * - minority_veto: ANY fail with confidence >= threshold = overall fail * - majority: >50% must agree on the verdict * - unanimous: ALL must agree */ @@ -105,7 +81,6 @@ export async function multiJudge( switch (strategy) { case "minority_veto": { - // Any judge that fails with sufficient confidence vetoes const vetoes = results.filter((r) => r.verdict === "fail" && r.confidence >= confidenceThreshold); const verdict = vetoes.length > 0 ? "fail" : "pass"; const reasoning = @@ -160,26 +135,3 @@ export async function multiJudge( } } } - -/** - * Estimate USD cost from token counts. - * Pricing as of March 2026. - */ -function estimateCost(model: string, inputTokens: number, outputTokens: number): number { - let inputPer1M: number; - let outputPer1M: number; - - if (model.includes("opus")) { - inputPer1M = 5.0; - outputPer1M = 25.0; - } else if (model.includes("haiku")) { - inputPer1M = 1.0; - outputPer1M = 5.0; - } else { - // Sonnet default - inputPer1M = 3.0; - outputPer1M = 15.0; - } - - return (inputTokens / 1_000_000) * inputPer1M + (outputTokens / 1_000_000) * outputPer1M; -} diff --git a/src/evolution/judges/consolidation-judge.ts b/src/evolution/judges/consolidation-judge.ts index 0e78576..01a3990 100644 --- a/src/evolution/judges/consolidation-judge.ts +++ b/src/evolution/judges/consolidation-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { SessionSummary } from "../types.ts"; import { callJudge } from "./client.ts"; import { consolidationPrompt } from "./prompts.ts"; @@ -10,6 +11,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts"; * contradictions with existing knowledge, and repeatable procedures. 
*/ export async function runConsolidationJudge( + runtime: AgentRuntime, session: SessionSummary, existingFacts: string, ): Promise> { @@ -26,7 +28,7 @@ export async function runConsolidationJudge( session.outcome, ); - return callJudge({ + return callJudge(runtime, { model: JUDGE_MODEL_SONNET, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/constitution-judge.ts b/src/evolution/judges/constitution-judge.ts index fa0b1ac..e99d2a7 100644 --- a/src/evolution/judges/constitution-judge.ts +++ b/src/evolution/judges/constitution-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { ConfigDelta } from "../types.ts"; import { callJudge, multiJudge } from "./client.ts"; import { constitutionGatePrompt } from "./prompts.ts"; @@ -13,6 +14,7 @@ import { JUDGE_MODEL_SONNET, type MultiJudgeResult } from "./types.ts"; * Fail-closed: if any judge call errors, the entire gate fails. */ export async function runConstitutionJudge( + runtime: AgentRuntime, delta: ConfigDelta, constitution: string, currentConfigText: string, @@ -27,7 +29,7 @@ export async function runConstitutionJudge( ); const makeJudge = () => () => - callJudge({ + callJudge(runtime, { model: JUDGE_MODEL_SONNET, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/observation-judge.ts b/src/evolution/judges/observation-judge.ts index 6ad3dbd..2892029 100644 --- a/src/evolution/judges/observation-judge.ts +++ b/src/evolution/judges/observation-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { EvolvedConfig, SessionObservation, SessionSummary } from "../types.ts"; import { callJudge } from "./client.ts"; import { observationExtractionPrompt } from "./prompts.ts"; @@ -9,6 +10,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts"; * Returns structured observations that are far richer than regex matching. 
*/ export async function extractObservationsWithJudge( + runtime: AgentRuntime, session: SessionSummary, currentConfig: EvolvedConfig, ): Promise> { @@ -16,7 +18,7 @@ export async function extractObservationsWithJudge( const configText = buildConfigText(currentConfig); const { system, user } = observationExtractionPrompt(transcript, configText); - return callJudge({ + return callJudge(runtime, { model: JUDGE_MODEL_SONNET, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/quality-judge.ts b/src/evolution/judges/quality-judge.ts index 942dd26..18a78bd 100644 --- a/src/evolution/judges/quality-judge.ts +++ b/src/evolution/judges/quality-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { EvolvedConfig, SessionSummary } from "../types.ts"; import { callJudge } from "./client.ts"; import { qualityAssessmentPrompt } from "./prompts.ts"; @@ -11,6 +12,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts"; * degradation that binary success/fail would miss. */ export async function runQualityJudge( + runtime: AgentRuntime, session: SessionSummary, currentConfig: EvolvedConfig, ): Promise> { @@ -27,7 +29,7 @@ export async function runQualityJudge( session.tools_used.join(", ") || "none", ); - return callJudge({ + return callJudge(runtime, { model: JUDGE_MODEL_SONNET, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/regression-judge.ts b/src/evolution/judges/regression-judge.ts index 2692d00..f63d8dc 100644 --- a/src/evolution/judges/regression-judge.ts +++ b/src/evolution/judges/regression-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { ConfigDelta, GoldenCase } from "../types.ts"; import { callJudge } from "./client.ts"; import { regressionGatePrompt } from "./prompts.ts"; @@ -26,6 +27,7 @@ type CaseJudgment = { * Returns early with pass if the golden suite is empty. 
*/ export async function runRegressionJudge( + runtime: AgentRuntime, delta: ConfigDelta, goldenSuite: GoldenCase[], currentConfigText: string, @@ -52,7 +54,7 @@ export async function runRegressionJudge( // Phase 1: Haiku evaluates all cases in parallel const haikuResults = await Promise.all( - goldenSuite.map((gc) => evaluateCase(delta, gc, currentConfigText, JUDGE_MODEL_HAIKU)), + goldenSuite.map((gc) => evaluateCase(runtime, delta, gc, currentConfigText, JUDGE_MODEL_HAIKU)), ); const results: CaseJudgment[] = []; @@ -92,7 +94,9 @@ export async function runRegressionJudge( // Phase 2: Sonnet re-evaluates uncertain cases if (needsEscalation.length > 0) { const sonnetResults = await Promise.all( - needsEscalation.map(({ goldenCase }) => evaluateCase(delta, goldenCase, currentConfigText, JUDGE_MODEL_SONNET)), + needsEscalation.map(({ goldenCase }) => + evaluateCase(runtime, delta, goldenCase, currentConfigText, JUDGE_MODEL_SONNET), + ), ); for (let i = 0; i < sonnetResults.length; i++) { @@ -129,6 +133,7 @@ export async function runRegressionJudge( } async function evaluateCase( + runtime: AgentRuntime, delta: ConfigDelta, goldenCase: GoldenCase, currentConfigText: string, @@ -145,7 +150,7 @@ async function evaluateCase( currentConfigText, ); - return callJudge({ + return callJudge(runtime, { model, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/safety-judge.ts b/src/evolution/judges/safety-judge.ts index c48d3de..0e5fc7f 100644 --- a/src/evolution/judges/safety-judge.ts +++ b/src/evolution/judges/safety-judge.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../../agent/runtime.ts"; import type { ConfigDelta } from "../types.ts"; import { callJudge, multiJudge } from "./client.ts"; import { safetyGatePrompt } from "./prompts.ts"; @@ -9,12 +10,13 @@ import { JUDGE_MODEL_SONNET, type MultiJudgeResult } from "./types.ts"; * * Runs 3 independent Sonnet judges in parallel. 
If ANY judge returns "fail" * with confidence > 0.7, the change is rejected. This maximizes safety at - * the cost of a higher false-rejection rate - which is the correct tradeoff + * the cost of a higher false-rejection rate, which is the correct tradeoff * for safety-critical gates. * * Fail-closed: if any judge call errors, the entire gate fails. */ export async function runSafetyJudge( + runtime: AgentRuntime, delta: ConfigDelta, constitution: string, currentConfigText: string, @@ -29,7 +31,7 @@ export async function runSafetyJudge( ); const makeJudge = () => () => - callJudge({ + callJudge(runtime, { model: JUDGE_MODEL_SONNET, systemPrompt: system, userMessage: user, diff --git a/src/evolution/judges/schemas.ts b/src/evolution/judges/schemas.ts index a96f69d..d2c7784 100644 --- a/src/evolution/judges/schemas.ts +++ b/src/evolution/judges/schemas.ts @@ -1,4 +1,4 @@ -// zod/v4 required: the Anthropic SDK's zodOutputFormat reads schema._zod.def (v4 only) +// zod/v4 required: judge-query.ts uses z.toJSONSchema (v4 only) import { z } from "zod/v4"; // -- Observation Extraction -- diff --git a/src/evolution/reflection.ts b/src/evolution/reflection.ts index bd0d1b8..2f4ed81 100644 --- a/src/evolution/reflection.ts +++ b/src/evolution/reflection.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../agent/runtime.ts"; import { matchesCorrectionPattern, matchesDomainFactPattern, matchesPreferencePattern } from "../shared/patterns.ts"; import type { EvolutionConfig } from "./config.ts"; import { extractObservationsWithJudge, toSessionObservations } from "./judges/observation-judge.ts"; @@ -10,11 +11,12 @@ import type { ConfigDelta, CritiqueResult, EvolvedConfig, SessionObservation, Se * sentiment signals that regex cannot detect. 
*/ export async function extractObservationsWithLLM( + runtime: AgentRuntime, session: SessionSummary, currentConfig: EvolvedConfig, ): Promise<{ observations: SessionObservation[]; judgeCost: JudgeCostEntry | null }> { try { - const result = await extractObservationsWithJudge(session, currentConfig); + const result = await extractObservationsWithJudge(runtime, session, currentConfig); const observations = toSessionObservations(result.data); return { observations: observations.length > 0 ? observations : extractObservations(session), diff --git a/src/evolution/validation.ts b/src/evolution/validation.ts index 5823ea4..c6562cd 100644 --- a/src/evolution/validation.ts +++ b/src/evolution/validation.ts @@ -1,5 +1,6 @@ import { readFileSync } from "node:fs"; import { join } from "node:path"; +import type { AgentRuntime } from "../agent/runtime.ts"; import type { EvolutionConfig } from "./config.ts"; import type { ConstitutionChecker } from "./constitution.ts"; import { runConstitutionJudge } from "./judges/constitution-judge.ts"; @@ -254,6 +255,7 @@ export function validateAll( * Non-critical gates (regression) fall back to heuristics on errors. 
*/ export async function validateAllWithJudges( + runtime: AgentRuntime, deltas: ConfigDelta[], checker: ConstitutionChecker, goldenSuite: GoldenCase[], @@ -271,7 +273,7 @@ export async function validateAllWithJudges( // Gate 1: Constitution - triple Sonnet with minority veto (fail-closed) try { - const constitutionResult = await runConstitutionJudge(delta, constitution, configText); + const constitutionResult = await runConstitutionJudge(runtime, delta, constitution, configText); gates.push({ gate: "constitution", passed: constitutionResult.verdict === "pass", @@ -292,7 +294,7 @@ export async function validateAllWithJudges( // Gate 2: Regression - cascaded Haiku -> Sonnet (fallback to heuristic) try { - const regressionResult = await runRegressionJudge(delta, goldenSuite, configText); + const regressionResult = await runRegressionJudge(runtime, delta, goldenSuite, configText); gates.push({ gate: "regression", passed: regressionResult.verdict === "pass", @@ -314,7 +316,7 @@ export async function validateAllWithJudges( // Gate 5: Safety - triple Sonnet with minority veto (fail-closed) try { - const safetyResult = await runSafetyJudge(delta, constitution, configText); + const safetyResult = await runSafetyJudge(runtime, delta, constitution, configText); gates.push({ gate: "safety", passed: safetyResult.verdict === "pass", diff --git a/src/index.ts b/src/index.ts index a6e0066..d032c4b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -95,9 +95,14 @@ async function main(): Promise { setMemoryHealthProvider(() => memory.healthCheck()); + // Runtime is created before evolution so we can wire it into the engine. + // Evolution judges run through the same Agent SDK subprocess as the main + // agent, which means a single auth path and a single provider switch. 
+ const runtime = new AgentRuntime(config, db); + let evolution: EvolutionEngine | null = null; try { - evolution = new EvolutionEngine(); + evolution = new EvolutionEngine(undefined, runtime); const currentVersion = evolution.getCurrentVersion(); const judgeMode = evolution.usesLLMJudges() ? "LLM judges" : "heuristic"; console.log(`[evolution] Engine initialized (v${currentVersion}, ${judgeMode})`); @@ -107,8 +112,6 @@ async function main(): Promise { console.warn(`[evolution] Failed to initialize: ${msg}. Running without self-evolution.`); } - const runtime = new AgentRuntime(config, db); - if (activeRole) { runtime.setRoleTemplate(activeRole); } @@ -494,7 +497,7 @@ async function main(): Promise { if (useLLMConsolidation) { const evolvedConfig = evolution?.getConfig(); const existingFacts = evolvedConfig ? `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}` : ""; - consolidateSessionWithLLM(memory, sessionData, existingFacts) + consolidateSessionWithLLM(runtime, memory, sessionData, existingFacts) .then(({ result, judgeCost }) => { if (judgeCost) { evolution?.trackExternalJudgeCost(judgeCost); diff --git a/src/memory/consolidation.ts b/src/memory/consolidation.ts index 35868d7..b88234d 100644 --- a/src/memory/consolidation.ts +++ b/src/memory/consolidation.ts @@ -1,3 +1,4 @@ +import type { AgentRuntime } from "../agent/runtime.ts"; import { runConsolidationJudge } from "../evolution/judges/consolidation-judge.ts"; import type { JudgeCostEntry } from "../evolution/judges/types.ts"; import type { SessionSummary } from "../evolution/types.ts"; @@ -11,13 +12,14 @@ import type { ConsolidationResult, Episode, SemanticFact } from "./types.ts"; * existing knowledge, and repeatable procedures. 
*/ export async function consolidateSessionWithLLM( + runtime: AgentRuntime, memory: MemorySystem, sessionData: SessionData, existingFacts: string, ): Promise<{ result: ConsolidationResult; judgeCost: JudgeCostEntry | null }> { try { const session = sessionDataToSummary(sessionData); - const judgeResult = await runConsolidationJudge(session, existingFacts); + const judgeResult = await runConsolidationJudge(runtime, session, existingFacts); const startTime = Date.now(); let factsExtracted = 0; diff --git a/src/ui/__tests__/events.test.ts b/src/ui/__tests__/events.test.ts index 8ceebbc..680dbed 100644 --- a/src/ui/__tests__/events.test.ts +++ b/src/ui/__tests__/events.test.ts @@ -62,15 +62,18 @@ describe("subscribe/publish", () => { }); test("getListenerCount tracks active listeners", () => { - expect(getListenerCount()).toBe(0); + // events.ts uses a module-level listener set that can be touched by + // other tests in the same bun test process. Measure relative to the + // initial count so this test is robust to ordering. + const initial = getListenerCount(); const unsub1 = subscribe(() => {}); - expect(getListenerCount()).toBe(1); + expect(getListenerCount()).toBe(initial + 1); const unsub2 = subscribe(() => {}); - expect(getListenerCount()).toBe(2); + expect(getListenerCount()).toBe(initial + 2); unsub1(); - expect(getListenerCount()).toBe(1); + expect(getListenerCount()).toBe(initial + 1); unsub2(); - expect(getListenerCount()).toBe(0); + expect(getListenerCount()).toBe(initial); }); });