diff --git a/bun.lock b/bun.lock
index bdf0674..705f903 100644
--- a/bun.lock
+++ b/bun.lock
@@ -6,7 +6,6 @@
       "name": "phantom",
       "dependencies": {
         "@anthropic-ai/claude-agent-sdk": "^0.2.77",
-        "@anthropic-ai/sdk": "^0.80.0",
         "@modelcontextprotocol/sdk": "^1.28.0",
         "@slack/bolt": "^4.6.0",
         "croner": "^10.0.1",
@@ -16,7 +15,6 @@
         "telegraf": "^4.16.3",
         "yaml": "^2.6.0",
         "zod": "^3.24.0",
-        "zod-to-json-schema": "^3.25.1",
       },
       "devDependencies": {
         "@biomejs/biome": "^1.9.0",
@@ -29,10 +27,6 @@
   "packages": {
     "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.84", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-rvp3kZJM4IgDBE1zwj30H3N0bI3pYRF28tDJoyAVuWTLiWls7diNVCyFz7GeXZEAYYD87lCBE3vnQplLLluNHg=="],
 
-    "@anthropic-ai/sdk": ["@anthropic-ai/sdk@0.80.0", "", { "dependencies": { "json-schema-to-ts": "^3.1.1" }, "peerDependencies": { "zod": "^3.25.0 || ^4.0.0" }, "optionalPeers": ["zod"], "bin": { "anthropic-ai-sdk": "bin/cli" } }, "sha512-WeXLn7zNVk3yjeshn+xZHvld6AoFUOR3Sep6pSoHho5YbSi6HwcirqgPA5ccFuW8QTVJAAU7N8uQQC6Wa9TG+g=="],
-
-    "@babel/runtime": ["@babel/runtime@7.29.2", "", {}, "sha512-JiDShH45zKHWyGe4ZNVRrCjBz8Nh9TMmZG1kh4QTK8hCBTWBi8Da+i7s1fJw7/lYpM4ccepSNfqzZ/QvABBi5g=="],
-
     "@biomejs/biome": ["@biomejs/biome@1.9.4", "", { "optionalDependencies": { "@biomejs/cli-darwin-arm64": "1.9.4", "@biomejs/cli-darwin-x64": "1.9.4", "@biomejs/cli-linux-arm64": "1.9.4", "@biomejs/cli-linux-arm64-musl": "1.9.4", "@biomejs/cli-linux-x64": "1.9.4", "@biomejs/cli-linux-x64-musl": "1.9.4", "@biomejs/cli-win32-arm64": "1.9.4", "@biomejs/cli-win32-x64": "1.9.4" }, "bin": { "biome": "bin/biome" } }, "sha512-1rkd7G70+o9KkTn5KLmDYXihGoTaIGO9PIIN2ZB7UJxFrWw04CZHPYiMRjYsaDvVV7hP1dYNRLxSANLaBFGpog=="],
 
     "@biomejs/cli-darwin-arm64": ["@biomejs/cli-darwin-arm64@1.9.4", "", { "os": "darwin", "cpu": "arm64" }, "sha512-bFBsPWrNvkdKrNCYeAp+xo2HecOGPAy9WyNyB/jKnnedgzl4W4Hb9ZMzYNbf8dMCGmUdSavlYHiR01QaYR58cw=="],
@@ -281,8 +275,6 @@
 
     "jose": ["jose@6.2.2", "", {}, "sha512-d7kPDd34KO/YnzaDOlikGpOurfF0ByC2sEV4cANCtdqLlTfBlw2p14O/5d/zv40gJPbIQxfES3nSx1/oYNyuZQ=="],
 
-    "json-schema-to-ts": ["json-schema-to-ts@3.1.1", "", { "dependencies": { "@babel/runtime": "^7.18.3", "ts-algebra": "^2.0.0" } }, "sha512-+DWg8jCJG2TEnpy7kOm/7/AxaYoaRbjVB4LFZLySZlWn8exGs3A4OLJR966cVvU26N7X9TWxl+Jsw7dzAqKT6g=="],
-
     "json-schema-traverse": ["json-schema-traverse@1.0.0", "", {}, "sha512-NM8/P9n3XjXhIZn1lLhkFaACTOURQXjWhV4BA/RnOv8xvgqtqpAX9IO4mRQxSx1Rlo4tqzeqb0sOlruaOy3dug=="],
 
     "json-schema-typed": ["json-schema-typed@8.0.2", "", {}, "sha512-fQhoXdcvc3V28x7C7BMs4P5+kNlgUURe2jmUT1T//oBRMDrqy1QPelJimwZGo7Hg9VPV3EQV5Bnq4hbFy2vetA=="],
@@ -443,8 +435,6 @@
 
     "tr46": ["tr46@0.0.3", "", {}, "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw=="],
 
-    "ts-algebra": ["ts-algebra@2.0.0", "", {}, "sha512-FPAhNPFMrkwz76P7cdjdmiShwMynZYN6SgOujD1urY4oNm80Ou9oMdmbR45LotcKOXoy7wSmHkRFE6Mxbrhefw=="],
-
     "tsscmp": ["tsscmp@1.0.6", "", {}, "sha512-LxhtAkPDTkVCMQjt2h6eBVY28KCjikZqZfMcC15YBeNjkgUpdCfBu5HoiOTDu86v6smE8yOjyEktJ8hlbANHQA=="],
 
     "type-is": ["type-is@2.0.1", "", { "dependencies": { "content-type": "^1.0.5", "media-typer": "^1.1.0", "mime-types": "^3.0.0" } }, "sha512-OZs6gsjF4vMp32qrCbiVSkrFmXtG/AZhY3t0iAMrMBiAZyV9oALtXO8hsrHbMXF9x6L3grlFuwW2oAz7cav+Gw=="],
diff --git a/package.json b/package.json
index e5dbcd4..b80234c 100644
--- a/package.json
+++ b/package.json
@@ -17,7 +17,6 @@
   },
   "dependencies": {
     "@anthropic-ai/claude-agent-sdk": "^0.2.77",
-    "@anthropic-ai/sdk": "^0.80.0",
     "@modelcontextprotocol/sdk": "^1.28.0",
     "@slack/bolt": "^4.6.0",
     "croner": "^10.0.1",
@@ -26,8 +25,7 @@
     "resend": "^6.9.4",
     "telegraf": "^4.16.3",
     "yaml": "^2.6.0",
-    "zod": "^3.24.0",
-    "zod-to-json-schema": "^3.25.1"
+    "zod": "^3.24.0"
   },
   "devDependencies": {
     "@biomejs/biome": "^1.9.0",
diff --git a/src/agent/__tests__/judge-query.test.ts b/src/agent/__tests__/judge-query.test.ts
new file mode 100644
index 0000000..065bf45
--- /dev/null
+++ b/src/agent/__tests__/judge-query.test.ts
@@ -0,0 +1,95 @@
+import { describe, expect, test } from "bun:test";
+import { z } from "zod/v4";
+import { parseJsonFromResponse } from "../judge-query.ts";
+
+// parseJsonFromResponse is the shape-normalization layer for judge subprocess output.
+// Models sometimes return markdown fences, leading prose, or trailing whitespace even
+// when asked for raw JSON. These tests lock in the tolerance window: we accept the
+// well-formed common cases and reject anything that cannot be safely parsed.
+
+const Schema = z.object({
+	verdict: z.enum(["pass", "fail"]),
+	confidence: z.number().min(0).max(1),
+	reasoning: z.string(),
+});
+
+describe("parseJsonFromResponse", () => {
+	test("parses raw JSON object", () => {
+		const reply = '{"verdict":"pass","confidence":0.95,"reasoning":"Looks clean."}';
+		const parsed = parseJsonFromResponse(reply, Schema);
+		expect(parsed.verdict).toBe("pass");
+		expect(parsed.confidence).toBe(0.95);
+	});
+
+	test("parses JSON wrapped in markdown json code fence", () => {
+		const reply = '```json\n{"verdict":"fail","confidence":0.8,"reasoning":"Issue detected."}\n```';
+		const parsed = parseJsonFromResponse(reply, Schema);
+		expect(parsed.verdict).toBe("fail");
+		expect(parsed.reasoning).toBe("Issue detected.");
+	});
+
+	test("parses JSON wrapped in plain markdown code fence", () => {
+		const reply = '```\n{"verdict":"pass","confidence":1,"reasoning":"ok"}\n```';
+		const parsed = parseJsonFromResponse(reply, Schema);
+		expect(parsed.verdict).toBe("pass");
+	});
+
+	test("handles leading/trailing whitespace", () => {
+		const reply = '\n\n  {"verdict":"pass","confidence":0.5,"reasoning":"fine"}  \n';
+		const parsed = parseJsonFromResponse(reply, Schema);
+		expect(parsed.verdict).toBe("pass");
+	});
+
+	test("recovers JSON from surrounding prose via brace scan", () => {
+		const reply = 'Here is my analysis: {"verdict":"fail","confidence":0.72,"reasoning":"Unsafe pattern"}. Thank you.';
+		const parsed = parseJsonFromResponse(reply, Schema);
+		expect(parsed.verdict).toBe("fail");
+		expect(parsed.confidence).toBe(0.72);
+	});
+
+	test("throws a clear error on empty response", () => {
+		expect(() => parseJsonFromResponse("", Schema)).toThrow(/empty/i);
+		expect(() => parseJsonFromResponse("   \n\n  ", Schema)).toThrow(/empty/i);
+	});
+
+	test("throws on text with no JSON object at all", () => {
+		expect(() => parseJsonFromResponse("I cannot comply with this request.", Schema)).toThrow(/non-JSON|invalid/i);
+	});
+
+	test("throws on malformed JSON", () => {
+		const reply = '{"verdict":"pass", "confidence":';
+		expect(() => parseJsonFromResponse(reply, Schema)).toThrow(/invalid JSON|non-JSON/i);
+	});
+
+	test("throws on JSON that violates the schema", () => {
+		const reply = '{"verdict":"maybe","confidence":0.9,"reasoning":"..."}';
+		expect(() => parseJsonFromResponse(reply, Schema)).toThrow(/schema validation/i);
+	});
+
+	test("throws on JSON missing required fields", () => {
+		const reply = '{"verdict":"pass"}';
+		expect(() => parseJsonFromResponse(reply, Schema)).toThrow(/schema validation/i);
+	});
+
+	test("throws on confidence out of range", () => {
+		const reply = '{"verdict":"pass","confidence":1.5,"reasoning":"over"}';
+		expect(() => parseJsonFromResponse(reply, Schema)).toThrow(/schema validation/i);
+	});
+
+	test("error message includes truncated response for debugging", () => {
+		const reply = "not json at all, just prose with no object";
+		expect(() => parseJsonFromResponse(reply, Schema)).toThrow(/not json/i);
+	});
+
+	test("parses nested structures", () => {
+		const NestedSchema = z.object({
+			flags: z.array(z.object({ category: z.string(), severity: z.enum(["critical", "warning", "info"]) })),
+			verdict: z.enum(["pass", "fail"]),
+		});
+		const reply = '```json\n{"flags":[{"category":"safety","severity":"critical"}],"verdict":"fail"}\n```';
+		const parsed = parseJsonFromResponse(reply, NestedSchema);
+		expect(parsed.flags).toHaveLength(1);
+		expect(parsed.flags[0].severity).toBe("critical");
+		expect(parsed.verdict).toBe("fail");
+	});
+});
diff --git a/src/agent/judge-query.ts b/src/agent/judge-query.ts
new file mode 100644
index 0000000..0901760
--- /dev/null
+++ b/src/agent/judge-query.ts
@@ -0,0 +1,206 @@
+import { query } from "@anthropic-ai/claude-agent-sdk";
+import { z } from "zod/v4";
+import type { PhantomConfig } from "../config/types.ts";
+import { extractTextFromMessage } from "./message-utils.ts";
+
+// Judge subprocess integration. Routes LLM judge calls through the same
+// Agent SDK `query()` subprocess as the main agent so that auth, provider,
+// and base URL flow through a single path. The older raw Anthropic SDK
+// integration (`client.messages.parse`) is gone; structured output is now
+// produced by prompt instruction + JSON.parse + Zod validation.
+
+export type JudgeQueryOptions<T> = {
+	systemPrompt: string; // caller's judge instructions; the JSON output contract is appended after it
+	userMessage: string; // sent as the `prompt` of the query() call
+	schema: z.ZodType<T>; // rendered into the prompt as JSON Schema and used to validate the reply
+	model?: string; // when omitted, runJudgeQuery falls back to config.judge_model, then config.model
+	maxTokens?: number; // NOTE(review): never read by runJudgeQuery, so it currently has no effect
+};
+
+export type JudgeQueryResult<T> = {
+	verdict: "pass" | "fail"; // copied from the parsed payload; "pass" when the payload has no verdict
+	confidence: number; // copied from the parsed payload; 1.0 when it has no numeric confidence
+	reasoning: string; // payload `reasoning`, else `overall_reasoning`, else ""
+	data: T; // the full schema-validated payload
+	model: string; // model name that was passed to query()
+	inputTokens: number; // derived from the result message's `usage`; 0 when absent
+	outputTokens: number; // derived from the result message's `usage`; 0 when absent
+	costUsd: number; // `total_cost_usd` of the result message; 0 when absent
+	durationMs: number; // wall-clock milliseconds spent inside runJudgeQuery
+};
+
+// Optional fields that runJudgeQuery reads off the validated payload to fill the result
+// envelope. A concrete judge schema may define any subset of them, or none.
+type JudgeEnvelopeFields = {
+	verdict?: "pass" | "fail";
+	confidence?: number;
+	reasoning?: string;
+	overall_reasoning?: string;
+};
+
+const OPENING_FENCE = /^```(?:json)?\s*\n?/;
+const CLOSING_FENCE = /\n?```\s*$/;
+
+/**
+ * Turn a judge subprocess reply into a value validated against `schema`.
+ *
+ * Accepted reply shapes:
+ *  1. A bare JSON document (what the prompt asks for)
+ *  2. A JSON document inside a leading ``` or ```json fence
+ *  3. A JSON object embedded in other text, recovered as the span from the
+ *     first `{` to the last `}`
+ *
+ * @throws Error if the reply is blank, cannot be parsed as JSON, or does not
+ *   satisfy `schema`. Partial data is never returned.
+ */
+export function parseJsonFromResponse<T>(text: string, schema: z.ZodType<T>): T {
+	const trimmed = text ? text.trim() : "";
+	if (trimmed.length === 0) {
+		throw new Error("Judge returned empty response");
+	}
+
+	// Only a fence at the very start of the reply is unwrapped here.
+	const candidate = trimmed.startsWith("```")
+		? trimmed.replace(OPENING_FENCE, "").replace(CLOSING_FENCE, "").trim()
+		: trimmed;
+
+	let payload: unknown;
+	try {
+		payload = JSON.parse(candidate);
+	} catch {
+		// Fallback for replies that surround the object with commentary: parse the widest `{...}` span.
+		const start = candidate.indexOf("{");
+		const end = candidate.lastIndexOf("}");
+		const hasSpan = start !== -1 && end > start;
+		if (!hasSpan) {
+			throw new Error(`Judge returned non-JSON response: ${truncate(text, 200)}`);
+		}
+		try {
+			payload = JSON.parse(candidate.slice(start, end + 1));
+		} catch (cause) {
+			const detail = cause instanceof Error ? cause.message : String(cause);
+			throw new Error(`Judge returned invalid JSON: ${detail}. Response: ${truncate(text, 200)}`);
+		}
+	}
+
+	// Decoding succeeded; the schema now decides whether the value is acceptable.
+	const outcome = schema.safeParse(payload);
+	if (outcome.success) return outcome.data;
+	throw new Error(`Judge output failed schema validation: ${formatZodError(outcome.error)}`);
+}
+
+/**
+ * Run one stateless judge evaluation through the Agent SDK `query()` subprocess.
+ *
+ * The caller's system prompt plus a JSON-schema contract is appended to the `claude_code`
+ * preset. The run uses `maxTurns: 1`, `effort: "low"`, `persistSession: false`, and passes
+ * no MCP servers or hooks. `options.maxTokens` is accepted but not forwarded to `query()`.
+ * @param config - Fallback model source (`judge_model`, then `model`) when `options.model` is unset.
+ * @param options - System prompt, user message, response schema, and optional model.
+ * @returns The validated payload with verdict/confidence/reasoning, token usage, cost, and duration.
+ * @throws Error on a non-success result, or when the reply is empty, not JSON, or fails the schema.
+ */
+export async function runJudgeQuery<T>(
+	config: PhantomConfig,
+	options: JudgeQueryOptions<T>,
+): Promise<JudgeQueryResult<T>> {
+	const startTime = Date.now();
+	// NOTE(review): an explicit `options.model` outranks `config.judge_model` — confirm that is intended.
+	const resolvedModel = options.model ?? config.judge_model ?? config.model;
+	const judgePrompt = buildJudgePrompt(options.systemPrompt, z.toJSONSchema(options.schema));
+	const queryStream = query({
+		prompt: options.userMessage,
+		options: {
+			model: resolvedModel,
+			// NOTE(review): no tool restrictions are passed while permissions are bypassed — confirm this is safe.
+			permissionMode: "bypassPermissions",
+			allowDangerouslySkipPermissions: true,
+			systemPrompt: { type: "preset" as const, preset: "claude_code" as const, append: judgePrompt },
+			maxTurns: 1,
+			effort: "low",
+			persistSession: false,
+		},
+	});
+
+	let responseText = "";
+	let inputTokens = 0;
+	let outputTokens = 0;
+	let resultCostUsd = 0;
+	let errored: string | null = null;
+
+	for await (const message of queryStream) {
+		switch (message.type) {
+			case "assistant": {
+				const content = extractTextFromMessage(message.message);
+				if (content) responseText = content;
+				break;
+			}
+			case "result": {
+				const msg = message as {
+					subtype: string;
+					result?: string;
+					total_cost_usd?: number;
+					usage?: {
+						input_tokens?: number;
+						output_tokens?: number;
+						cache_read_input_tokens?: number;
+						cache_creation_input_tokens?: number;
+					};
+				};
+				if (msg.subtype !== "success") errored = msg.subtype;
+				else if (msg.result) responseText = msg.result;
+				// Count cache reads and writes as input, the same way extractCost() totals them.
+				inputTokens =
+					(msg.usage?.input_tokens ?? 0) +
+					(msg.usage?.cache_read_input_tokens ?? 0) +
+					(msg.usage?.cache_creation_input_tokens ?? 0);
+				outputTokens = msg.usage?.output_tokens ?? 0;
+				resultCostUsd = msg.total_cost_usd ?? 0;
+				break;
+			}
+		}
+	}
+
+	if (errored) throw new Error(`Judge subprocess ended with ${errored}`);
+	const parsed = parseJsonFromResponse<T>(responseText, options.schema);
+	const envelope = parsed as T & JudgeEnvelopeFields;
+
+	return {
+		verdict: envelope.verdict ?? "pass",
+		confidence: typeof envelope.confidence === "number" ? envelope.confidence : 1.0,
+		reasoning: envelope.reasoning ?? envelope.overall_reasoning ?? "",
+		data: parsed,
+		model: resolvedModel,
+		inputTokens,
+		outputTokens,
+		costUsd: resultCostUsd,
+		durationMs: Date.now() - startTime,
+	};
+}
+
+function buildJudgePrompt(systemPrompt: string, schemaJson: unknown): string {
+	// Fixed output contract that follows the caller's own instructions.
+	const outputRules = [
+		"You MUST respond with ONLY a JSON object that conforms to the schema below.",
+		"Do not include markdown code fences, prose, explanations, or any text outside the JSON object.",
+		"The first character of your response must be `{` and the last must be `}`.",
+	];
+	const schemaText = JSON.stringify(schemaJson, null, 2);
+
+	// Layout: caller prompt, blank line, output rules, blank line, schema heading, schema body.
+	return [systemPrompt, "", ...outputRules, "", "Schema:", schemaText].join("\n");
+}
+
+function formatZodError(error: z.ZodError): string {
+	// Show at most three issues. `String()` keeps symbol path keys from making `join` throw.
+	const shown = error.issues.slice(0, 3).map((issue) => {
+		const path = issue.path.length > 0 ? issue.path.map(String).join(".") : "(root)";
+		return `${path}: ${issue.message}`;
+	});
+	return `${shown.join("; ")}${error.issues.length > 3 ? ` (+${error.issues.length - 3} more)` : ""}`;
+}
+
+function truncate(text: string, max: number): string {
+	// Text within the limit is returned untouched; longer text is cut at `max` and marked with "...".
+	return text.length > max ? `${text.slice(0, max)}...` : text;
+}
diff --git a/src/agent/message-utils.ts b/src/agent/message-utils.ts
new file mode 100644
index 0000000..7f96a4f
--- /dev/null
+++ b/src/agent/message-utils.ts
@@ -0,0 +1,56 @@
+import type { AgentCost } from "./events.ts";
+
+// Shared Agent SDK message parsing used by both the main query and the judge query.
+// These helpers were previously private to runtime.ts. Lifting them out keeps
+// runtime.ts below the 300-line ceiling and lets judgeQuery() reuse them without
+// duplication.
+
+export function extractTextFromMessage(message: {
+	content: ReadonlyArray<{ type: string; text?: string }>;
+}): string {
+	// Non-text blocks and text blocks with no content contribute nothing to the output.
+	const isText = (block: { type: string; text?: string }) => block.type === "text" && Boolean(block.text);
+	const texts = message.content.flatMap((block) => (isText(block) ? [block.text ?? ""] : []));
+	return texts.join("\n");
+}
+
+export function extractCost(message: {
+	total_cost_usd: number;
+	usage: Record<string, number>;
+	modelUsage: Record<
+		string,
+		{
+			inputTokens: number;
+			outputTokens: number;
+			cacheReadInputTokens?: number;
+			cacheCreationInputTokens?: number;
+			costUSD: number;
+		}
+	>;
+}): AgentCost {
+	const perModel: AgentCost["modelUsage"] = {};
+	let inputTotal = 0;
+	let outputTotal = 0;
+
+	// Single pass: normalise each model's usage and fold it into the grand totals.
+	for (const [modelName, raw] of Object.entries(message.modelUsage)) {
+		const cacheRead = raw.cacheReadInputTokens ?? 0;
+		const cacheWrite = raw.cacheCreationInputTokens ?? 0;
+		const inputForModel = raw.inputTokens + cacheRead + cacheWrite;
+
+		perModel[modelName] = {
+			inputTokens: inputForModel,
+			outputTokens: raw.outputTokens,
+			costUsd: raw.costUSD,
+		};
+		inputTotal += inputForModel;
+		outputTotal += raw.outputTokens;
+	}
+
+	return {
+		totalUsd: message.total_cost_usd,
+		inputTokens: inputTotal,
+		outputTokens: outputTotal,
+		modelUsage: perModel,
+	};
+}
diff --git a/src/agent/runtime.ts b/src/agent/runtime.ts
index 4ed3fc8..4c2eddb 100644
--- a/src/agent/runtime.ts
+++ b/src/agent/runtime.ts
@@ -8,6 +8,8 @@ import type { RoleTemplate } from "../roles/types.ts";
 import { CostTracker } from "./cost-tracker.ts";
 import { type AgentCost, type AgentResponse, emptyCost } from "./events.ts";
 import { createDangerousCommandBlocker, createFileTracker } from "./hooks.ts";
+import { type JudgeQueryOptions, type JudgeQueryResult, runJudgeQuery } from "./judge-query.ts";
+import { extractCost, extractTextFromMessage } from "./message-utils.ts";
 import { assemblePrompt } from "./prompt-assembler.ts";
 import { SessionStore } from "./session-store.ts";
 
@@ -103,6 +105,18 @@ export class AgentRuntime {
 		return this.activeSessions.size;
 	}
 
+	/**
+	 * Run a one-shot judge evaluation by delegating to `runJudgeQuery` with this runtime's config.
+	 *
+	 * @param options - System prompt, user message, and the Zod schema the JSON reply must satisfy;
+	 *   `model` is optional and falls back to the runtime config when omitted.
+	 * @returns The validated judge payload together with verdict, token usage, cost, and duration.
+	 * @throws Error if the judge run does not succeed or its reply fails parsing or validation.
+	 */
+	async judgeQuery<T>(options: JudgeQueryOptions<T>): Promise<JudgeQueryResult<T>> {
+		return runJudgeQuery(this.config, options);
+	}
+
 	private async runQuery(
 		sessionKey: string,
 		channelId: string,
@@ -259,53 +273,3 @@ export class AgentRuntime {
 		};
 	}
 }
-
-function extractTextFromMessage(message: {
-	content: ReadonlyArray<{ type: string; text?: string }>;
-}): string {
-	return message.content
-		.filter((block) => block.type === "text" && block.text)
-		.map((block) => block.text ?? "")
-		.join("\n");
-}
-
-function extractCost(message: {
-	total_cost_usd: number;
-	usage: Record<string, number>;
-	modelUsage: Record<
-		string,
-		{
-			inputTokens: number;
-			outputTokens: number;
-			cacheReadInputTokens?: number;
-			cacheCreationInputTokens?: number;
-			costUSD: number;
-		}
-	>;
-}): AgentCost {
-	const modelUsage: AgentCost["modelUsage"] = {};
-
-	for (const [model, usage] of Object.entries(message.modelUsage)) {
-		const totalModelInput =
-			usage.inputTokens + (usage.cacheReadInputTokens ?? 0) + (usage.cacheCreationInputTokens ?? 0);
-		modelUsage[model] = {
-			inputTokens: totalModelInput,
-			outputTokens: usage.outputTokens,
-			costUsd: usage.costUSD,
-		};
-	}
-
-	let totalInput = 0;
-	let totalOutput = 0;
-	for (const usage of Object.values(modelUsage)) {
-		totalInput += usage.inputTokens;
-		totalOutput += usage.outputTokens;
-	}
-
-	return {
-		totalUsd: message.total_cost_usd,
-		inputTokens: totalInput,
-		outputTokens: totalOutput,
-		modelUsage,
-	};
-}
diff --git a/src/config/schemas.ts b/src/config/schemas.ts
index dbce22c..9d4e52d 100644
--- a/src/config/schemas.ts
+++ b/src/config/schemas.ts
@@ -14,6 +14,10 @@ export const PhantomConfigSchema = z.object({
 	port: z.number().int().min(1).max(65535).default(3100),
 	role: z.string().min(1).default("swe"),
 	model: z.string().min(1).default("claude-sonnet-4-6"),
+	// Optional override for the model used by evolution judges. Defaults to `model` when omitted
+	// so a single-model deployment "just works". Lets operators run a cheaper model for judging
+	// while keeping a more capable model for the primary agent.
+	judge_model: z.string().min(1).optional(),
 	effort: z.enum(["low", "medium", "high", "max"]).default("max"),
 	max_budget_usd: z.number().min(0).default(0),
 	timeout_minutes: z.number().min(1).default(240),
diff --git a/src/evolution/engine.ts b/src/evolution/engine.ts
index c32b4df..a2ed021 100644
--- a/src/evolution/engine.ts
+++ b/src/evolution/engine.ts
@@ -1,5 +1,6 @@
 import { readFileSync, writeFileSync } from "node:fs";
 import { join } from "node:path";
+import type { AgentRuntime } from "../agent/runtime.ts";
 import { applyApproved } from "./application.ts";
 import { type EvolutionConfig, loadEvolutionConfig } from "./config.ts";
 import { recordObservations, runConsolidation } from "./consolidation.ts";
@@ -32,18 +33,27 @@ export class EvolutionEngine {
 	private llmJudgesEnabled: boolean;
 	private dailyCostUsd = 0;
 	private dailyCostResetDate = "";
+	private runtime: AgentRuntime | null;
 
-	constructor(configPath?: string) {
+	// `runtime` is optional so existing tests and heuristic-only deployments can
+	// construct an engine without wiring a full AgentRuntime. When the engine
+	// is asked to use LLM judges but has no runtime, it falls back to heuristics.
+	constructor(configPath?: string, runtime?: AgentRuntime) {
 		this.config = loadEvolutionConfig(configPath);
 		this.checker = new ConstitutionChecker(this.config);
+		this.runtime = runtime ?? null;
 		this.llmJudgesEnabled = this.resolveJudgeMode();
 		if (this.llmJudgesEnabled) {
-			console.log("[evolution] LLM judges enabled (API key detected)");
+			console.log("[evolution] LLM judges enabled");
 		} else {
-			console.log("[evolution] LLM judges disabled (no API key or config override)");
+			console.log("[evolution] LLM judges disabled (config override or no auth detected)");
 		}
 	}
 
+	setRuntime(runtime: AgentRuntime): void {
+		this.runtime = runtime; // judge steps check `this.runtime` when they run, so a late-bound runtime is picked up
+	}
+
 	private resolveJudgeMode(): boolean {
 		const setting = this.config.judges?.enabled ?? "auto";
 		if (setting === "never") return false;
@@ -82,9 +92,9 @@ export class EvolutionEngine {
 
 		// Step 1: Observation Extraction (LLM or heuristic)
 		let observations: import("./types.ts").SessionObservation[];
-		if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) {
+		if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) {
 			const currentConfig = this.getConfig();
-			const result = await extractObservationsWithLLM(session, currentConfig);
+			const result = await extractObservationsWithLLM(this.runtime, session, currentConfig);
 			observations = result.observations;
 			if (result.judgeCost) {
 				addCost(judgeCosts.observation_extraction, result.judgeCost);
@@ -119,8 +129,15 @@ export class EvolutionEngine {
 		const goldenSuite = loadSuite(this.config);
 		let validationResults: import("./types.ts").ValidationResult[];
 
-		if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) {
-			const judgeResult = await validateAllWithJudges(deltas, this.checker, goldenSuite, this.config, currentConfig);
+		if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) {
+			const judgeResult = await validateAllWithJudges(
+				this.runtime,
+				deltas,
+				this.checker,
+				goldenSuite,
+				this.config,
+				currentConfig,
+			);
 			validationResults = judgeResult.results;
 			mergeCosts(judgeCosts, judgeResult.judgeCosts);
 			this.incrementDailyCost(totalCostFromJudgeCosts(judgeResult.judgeCosts));
@@ -161,9 +178,9 @@ export class EvolutionEngine {
 		}
 
 		// Quality Assessment (LLM only, non-blocking)
-		if (this.llmJudgesEnabled && !this.isDailyCostCapReached()) {
+		if (this.llmJudgesEnabled && this.runtime && !this.isDailyCostCapReached()) {
 			try {
-				const qualityResult = await runQualityJudge(session, currentConfig);
+				const qualityResult = await runQualityJudge(this.runtime, session, currentConfig);
 				judgeCosts.quality_assessment.calls++;
 				judgeCosts.quality_assessment.totalUsd += qualityResult.costUsd;
 				judgeCosts.quality_assessment.totalInputTokens += qualityResult.inputTokens;
diff --git a/src/evolution/judges/client.ts b/src/evolution/judges/client.ts
index 6254e99..23cc835 100644
--- a/src/evolution/judges/client.ts
+++ b/src/evolution/judges/client.ts
@@ -1,87 +1,63 @@
-import Anthropic from "@anthropic-ai/sdk";
-import { zodOutputFormat } from "@anthropic-ai/sdk/helpers/zod";
-// zod/v4 required: matches schemas.ts for zodOutputFormat compatibility
+// zod/v4 required: matches schemas.ts so judge schemas flow through unchanged.
 import type { z } from "zod/v4";
-import {
-	JUDGE_MAX_TOKENS,
-	JUDGE_TEMPERATURE,
-	type JudgeResult,
-	type MultiJudgeResult,
-	type VotingStrategy,
-} from "./types.ts";
+import type { AgentRuntime } from "../../agent/runtime.ts";
+import type { JudgeResult, MultiJudgeResult, VotingStrategy } from "./types.ts";
 
-let _client: Anthropic | null = null;
-
-function getClient(): Anthropic {
-	if (!_client) {
-		_client = new Anthropic();
-	}
-	return _client;
-}
-
-// Visible for testing - allows injecting a mock client
-export function setClient(client: Anthropic | null): void {
-	_client = client;
-}
+// Judges used to live on the raw Anthropic SDK (`client.messages.parse`). They now
+// route through the same Agent SDK subprocess as the main agent, so a single auth
+// path and a single provider env var control both tiers. The shape of this module
+// is deliberately small: it exists to delegate, not to own its own transport.
 
+/**
+ * Back-compat signal: does the judge machinery have any hope of running?
+ *
+ * With the old raw-SDK design this checked `ANTHROPIC_API_KEY`. Under the
+ * subprocess design, authentication is handled by the Claude Code CLI itself
+ * (via `claude login`, custom base URLs, or env vars like `ANTHROPIC_BASE_URL`).
+ * There is no reliable way to introspect CLI auth status from this module,
+ * and a failed subprocess call will surface a clear error anyway. Returning
+ * `true` preserves any callers without reintroducing an auth coupling.
+ */
 export function isJudgeAvailable(): boolean {
-	return !!process.env.ANTHROPIC_API_KEY;
+	return true;
 }
 
 /**
  * Call a single LLM judge with structured output.
- * Uses the raw Anthropic SDK (not the Agent SDK).
- * Temperature 0 for deterministic judging.
+ *
+ * Returns a `JudgeResult<T>` matching the pre-subprocess contract so every
+ * downstream judge (safety, constitution, observation, etc.) and the voting
+ * logic in `multiJudge()` continue to work without changes to their shape.
  */
-export async function callJudge<T>(options: {
-	model: string;
-	systemPrompt: string;
-	userMessage: string;
-	schema: z.ZodType<T>;
-	schemaName?: string;
-	maxTokens?: number;
-}): Promise<JudgeResult<T>> {
-	const client = getClient();
-	const startTime = Date.now();
-
-	const message = await client.messages.parse({
+export async function callJudge<T>(
+	runtime: AgentRuntime,
+	options: {
+		model: string;
+		systemPrompt: string;
+		userMessage: string;
+		schema: z.ZodType<T>;
+		schemaName?: string;
+		maxTokens?: number;
+	},
+): Promise<JudgeResult<T>> {
+	const result = await runtime.judgeQuery<T>({
+		systemPrompt: options.systemPrompt,
+		userMessage: options.userMessage,
+		schema: options.schema,
 		model: options.model,
-		max_tokens: options.maxTokens ?? JUDGE_MAX_TOKENS,
-		temperature: JUDGE_TEMPERATURE,
-		system: options.systemPrompt,
-		messages: [{ role: "user", content: options.userMessage }],
-		output_config: {
-			// Cast needed: SDK .d.ts references zod v3 types but runtime uses zod/v4
-			// biome-ignore lint/suspicious/noExplicitAny: bridging zod v3/v4 type mismatch
-			format: zodOutputFormat(options.schema as any),
-		},
+		maxTokens: options.maxTokens,
 	});
 
-	const parsed = message.parsed_output;
-	if (!parsed) {
-		throw new Error(`Judge returned no structured output (stop_reason: ${message.stop_reason})`);
-	}
-
-	const inputTokens = message.usage.input_tokens;
-	const outputTokens = message.usage.output_tokens;
-	const costUsd = estimateCost(options.model, inputTokens, outputTokens);
-
-	// Extract verdict and confidence from the parsed data if present
-	const data = parsed as Record<string, unknown>;
-	const verdict = (data.verdict as "pass" | "fail") ?? "pass";
-	const confidence = (data.confidence as number) ?? 1.0;
-	const reasoning = (data.reasoning as string) ?? (data.overall_reasoning as string) ?? "";
-
 	return {
-		verdict,
-		confidence,
-		reasoning,
-		data: parsed,
-		model: options.model,
-		inputTokens,
-		outputTokens,
-		costUsd,
-		durationMs: Date.now() - startTime,
+		verdict: result.verdict,
+		confidence: result.confidence,
+		reasoning: result.reasoning,
+		data: result.data,
+		model: result.model,
+		inputTokens: result.inputTokens,
+		outputTokens: result.outputTokens,
+		costUsd: result.costUsd,
+		durationMs: result.durationMs,
 	};
 }
 
@@ -89,7 +65,7 @@ export async function callJudge<T>(options: {
  * Run multiple judges in parallel and aggregate results.
  *
  * Strategies:
- * - minority_veto: ANY fail with confidence > threshold = overall fail
+ * - minority_veto: ANY fail with confidence >= threshold = overall fail
  * - majority: >50% must agree on the verdict
  * - unanimous: ALL must agree
  */
@@ -105,7 +81,6 @@ export async function multiJudge<T>(
 
 	switch (strategy) {
 		case "minority_veto": {
-			// Any judge that fails with sufficient confidence vetoes
 			const vetoes = results.filter((r) => r.verdict === "fail" && r.confidence >= confidenceThreshold);
 			const verdict = vetoes.length > 0 ? "fail" : "pass";
 			const reasoning =
@@ -160,26 +135,3 @@ export async function multiJudge<T>(
 		}
 	}
 }
-
-/**
- * Estimate USD cost from token counts.
- * Pricing as of March 2026.
- */
-function estimateCost(model: string, inputTokens: number, outputTokens: number): number {
-	let inputPer1M: number;
-	let outputPer1M: number;
-
-	if (model.includes("opus")) {
-		inputPer1M = 5.0;
-		outputPer1M = 25.0;
-	} else if (model.includes("haiku")) {
-		inputPer1M = 1.0;
-		outputPer1M = 5.0;
-	} else {
-		// Sonnet default
-		inputPer1M = 3.0;
-		outputPer1M = 15.0;
-	}
-
-	return (inputTokens / 1_000_000) * inputPer1M + (outputTokens / 1_000_000) * outputPer1M;
-}
diff --git a/src/evolution/judges/consolidation-judge.ts b/src/evolution/judges/consolidation-judge.ts
index 0e78576..01a3990 100644
--- a/src/evolution/judges/consolidation-judge.ts
+++ b/src/evolution/judges/consolidation-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { SessionSummary } from "../types.ts";
 import { callJudge } from "./client.ts";
 import { consolidationPrompt } from "./prompts.ts";
@@ -10,6 +11,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts";
  * contradictions with existing knowledge, and repeatable procedures.
  */
 export async function runConsolidationJudge(
+	runtime: AgentRuntime,
 	session: SessionSummary,
 	existingFacts: string,
 ): Promise<JudgeResult<ConsolidationJudgeResultType>> {
@@ -26,7 +28,7 @@ export async function runConsolidationJudge(
 		session.outcome,
 	);
 
-	return callJudge({
+	return callJudge(runtime, {
 		model: JUDGE_MODEL_SONNET,
 		systemPrompt: system,
 		userMessage: user,
diff --git a/src/evolution/judges/constitution-judge.ts b/src/evolution/judges/constitution-judge.ts
index fa0b1ac..e99d2a7 100644
--- a/src/evolution/judges/constitution-judge.ts
+++ b/src/evolution/judges/constitution-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { ConfigDelta } from "../types.ts";
 import { callJudge, multiJudge } from "./client.ts";
 import { constitutionGatePrompt } from "./prompts.ts";
@@ -13,6 +14,7 @@ import { JUDGE_MODEL_SONNET, type MultiJudgeResult } from "./types.ts";
  * Fail-closed: if any judge call errors, the entire gate fails.
  */
 export async function runConstitutionJudge(
+	runtime: AgentRuntime,
 	delta: ConfigDelta,
 	constitution: string,
 	currentConfigText: string,
@@ -27,7 +29,7 @@ export async function runConstitutionJudge(
 	);
 
 	const makeJudge = () => () =>
-		callJudge({
+		callJudge(runtime, {
 			model: JUDGE_MODEL_SONNET,
 			systemPrompt: system,
 			userMessage: user,
diff --git a/src/evolution/judges/observation-judge.ts b/src/evolution/judges/observation-judge.ts
index 6ad3dbd..2892029 100644
--- a/src/evolution/judges/observation-judge.ts
+++ b/src/evolution/judges/observation-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { EvolvedConfig, SessionObservation, SessionSummary } from "../types.ts";
 import { callJudge } from "./client.ts";
 import { observationExtractionPrompt } from "./prompts.ts";
@@ -9,6 +10,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts";
  * Returns structured observations that are far richer than regex matching.
  */
 export async function extractObservationsWithJudge(
+	runtime: AgentRuntime,
 	session: SessionSummary,
 	currentConfig: EvolvedConfig,
 ): Promise<JudgeResult<ObservationExtractionResultType>> {
@@ -16,7 +18,7 @@ export async function extractObservationsWithJudge(
 	const configText = buildConfigText(currentConfig);
 	const { system, user } = observationExtractionPrompt(transcript, configText);
 
-	return callJudge({
+	return callJudge(runtime, {
 		model: JUDGE_MODEL_SONNET,
 		systemPrompt: system,
 		userMessage: user,
diff --git a/src/evolution/judges/quality-judge.ts b/src/evolution/judges/quality-judge.ts
index 942dd26..18a78bd 100644
--- a/src/evolution/judges/quality-judge.ts
+++ b/src/evolution/judges/quality-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { EvolvedConfig, SessionSummary } from "../types.ts";
 import { callJudge } from "./client.ts";
 import { qualityAssessmentPrompt } from "./prompts.ts";
@@ -11,6 +12,7 @@ import { JUDGE_MODEL_SONNET, type JudgeResult } from "./types.ts";
  * degradation that binary success/fail would miss.
  */
 export async function runQualityJudge(
+	runtime: AgentRuntime,
 	session: SessionSummary,
 	currentConfig: EvolvedConfig,
 ): Promise<JudgeResult<QualityAssessmentResultType>> {
@@ -27,7 +29,7 @@ export async function runQualityJudge(
 		session.tools_used.join(", ") || "none",
 	);
 
-	return callJudge({
+	return callJudge(runtime, {
 		model: JUDGE_MODEL_SONNET,
 		systemPrompt: system,
 		userMessage: user,
diff --git a/src/evolution/judges/regression-judge.ts b/src/evolution/judges/regression-judge.ts
index 2692d00..f63d8dc 100644
--- a/src/evolution/judges/regression-judge.ts
+++ b/src/evolution/judges/regression-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { ConfigDelta, GoldenCase } from "../types.ts";
 import { callJudge } from "./client.ts";
 import { regressionGatePrompt } from "./prompts.ts";
@@ -26,6 +27,7 @@ type CaseJudgment = {
  * Returns early with pass if the golden suite is empty.
  */
 export async function runRegressionJudge(
+	runtime: AgentRuntime,
 	delta: ConfigDelta,
 	goldenSuite: GoldenCase[],
 	currentConfigText: string,
@@ -52,7 +54,7 @@ export async function runRegressionJudge(
 
 	// Phase 1: Haiku evaluates all cases in parallel
 	const haikuResults = await Promise.all(
-		goldenSuite.map((gc) => evaluateCase(delta, gc, currentConfigText, JUDGE_MODEL_HAIKU)),
+		goldenSuite.map((gc) => evaluateCase(runtime, delta, gc, currentConfigText, JUDGE_MODEL_HAIKU)),
 	);
 
 	const results: CaseJudgment[] = [];
@@ -92,7 +94,9 @@ export async function runRegressionJudge(
 	// Phase 2: Sonnet re-evaluates uncertain cases
 	if (needsEscalation.length > 0) {
 		const sonnetResults = await Promise.all(
-			needsEscalation.map(({ goldenCase }) => evaluateCase(delta, goldenCase, currentConfigText, JUDGE_MODEL_SONNET)),
+			needsEscalation.map(({ goldenCase }) =>
+				evaluateCase(runtime, delta, goldenCase, currentConfigText, JUDGE_MODEL_SONNET),
+			),
 		);
 
 		for (let i = 0; i < sonnetResults.length; i++) {
@@ -129,6 +133,7 @@ export async function runRegressionJudge(
 }
 
 async function evaluateCase(
+	runtime: AgentRuntime,
 	delta: ConfigDelta,
 	goldenCase: GoldenCase,
 	currentConfigText: string,
@@ -145,7 +150,7 @@ async function evaluateCase(
 		currentConfigText,
 	);
 
-	return callJudge({
+	return callJudge(runtime, {
 		model,
 		systemPrompt: system,
 		userMessage: user,
diff --git a/src/evolution/judges/safety-judge.ts b/src/evolution/judges/safety-judge.ts
index c48d3de..0e5fc7f 100644
--- a/src/evolution/judges/safety-judge.ts
+++ b/src/evolution/judges/safety-judge.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../../agent/runtime.ts";
 import type { ConfigDelta } from "../types.ts";
 import { callJudge, multiJudge } from "./client.ts";
 import { safetyGatePrompt } from "./prompts.ts";
@@ -9,12 +10,13 @@ import { JUDGE_MODEL_SONNET, type MultiJudgeResult } from "./types.ts";
  *
  * Runs 3 independent Sonnet judges in parallel. If ANY judge returns "fail"
  * with confidence > 0.7, the change is rejected. This maximizes safety at
- * the cost of a higher false-rejection rate - which is the correct tradeoff
+ * the cost of a higher false-rejection rate, which is the correct tradeoff
  * for safety-critical gates.
  *
  * Fail-closed: if any judge call errors, the entire gate fails.
  */
 export async function runSafetyJudge(
+	runtime: AgentRuntime,
 	delta: ConfigDelta,
 	constitution: string,
 	currentConfigText: string,
@@ -29,7 +31,7 @@ export async function runSafetyJudge(
 	);
 
 	const makeJudge = () => () =>
-		callJudge({
+		callJudge(runtime, {
 			model: JUDGE_MODEL_SONNET,
 			systemPrompt: system,
 			userMessage: user,
diff --git a/src/evolution/judges/schemas.ts b/src/evolution/judges/schemas.ts
index a96f69d..d2c7784 100644
--- a/src/evolution/judges/schemas.ts
+++ b/src/evolution/judges/schemas.ts
@@ -1,4 +1,4 @@
-// zod/v4 required: the Anthropic SDK's zodOutputFormat reads schema._zod.def (v4 only)
+// zod/v4 required: judge-query.ts uses z.toJSONSchema (v4 only)
 import { z } from "zod/v4";
 
 // -- Observation Extraction --
diff --git a/src/evolution/reflection.ts b/src/evolution/reflection.ts
index bd0d1b8..2f4ed81 100644
--- a/src/evolution/reflection.ts
+++ b/src/evolution/reflection.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../agent/runtime.ts";
 import { matchesCorrectionPattern, matchesDomainFactPattern, matchesPreferencePattern } from "../shared/patterns.ts";
 import type { EvolutionConfig } from "./config.ts";
 import { extractObservationsWithJudge, toSessionObservations } from "./judges/observation-judge.ts";
@@ -10,11 +11,12 @@ import type { ConfigDelta, CritiqueResult, EvolvedConfig, SessionObservation, Se
  * sentiment signals that regex cannot detect.
  */
 export async function extractObservationsWithLLM(
+	runtime: AgentRuntime,
 	session: SessionSummary,
 	currentConfig: EvolvedConfig,
 ): Promise<{ observations: SessionObservation[]; judgeCost: JudgeCostEntry | null }> {
 	try {
-		const result = await extractObservationsWithJudge(session, currentConfig);
+		const result = await extractObservationsWithJudge(runtime, session, currentConfig);
 		const observations = toSessionObservations(result.data);
 		return {
 			observations: observations.length > 0 ? observations : extractObservations(session),
diff --git a/src/evolution/validation.ts b/src/evolution/validation.ts
index 5823ea4..c6562cd 100644
--- a/src/evolution/validation.ts
+++ b/src/evolution/validation.ts
@@ -1,5 +1,6 @@
 import { readFileSync } from "node:fs";
 import { join } from "node:path";
+import type { AgentRuntime } from "../agent/runtime.ts";
 import type { EvolutionConfig } from "./config.ts";
 import type { ConstitutionChecker } from "./constitution.ts";
 import { runConstitutionJudge } from "./judges/constitution-judge.ts";
@@ -254,6 +255,7 @@ export function validateAll(
  * Non-critical gates (regression) fall back to heuristics on errors.
  */
 export async function validateAllWithJudges(
+	runtime: AgentRuntime,
 	deltas: ConfigDelta[],
 	checker: ConstitutionChecker,
 	goldenSuite: GoldenCase[],
@@ -271,7 +273,7 @@ export async function validateAllWithJudges(
 
 		// Gate 1: Constitution - triple Sonnet with minority veto (fail-closed)
 		try {
-			const constitutionResult = await runConstitutionJudge(delta, constitution, configText);
+			const constitutionResult = await runConstitutionJudge(runtime, delta, constitution, configText);
 			gates.push({
 				gate: "constitution",
 				passed: constitutionResult.verdict === "pass",
@@ -292,7 +294,7 @@ export async function validateAllWithJudges(
 
 		// Gate 2: Regression - cascaded Haiku -> Sonnet (fallback to heuristic)
 		try {
-			const regressionResult = await runRegressionJudge(delta, goldenSuite, configText);
+			const regressionResult = await runRegressionJudge(runtime, delta, goldenSuite, configText);
 			gates.push({
 				gate: "regression",
 				passed: regressionResult.verdict === "pass",
@@ -314,7 +316,7 @@ export async function validateAllWithJudges(
 
 		// Gate 5: Safety - triple Sonnet with minority veto (fail-closed)
 		try {
-			const safetyResult = await runSafetyJudge(delta, constitution, configText);
+			const safetyResult = await runSafetyJudge(runtime, delta, constitution, configText);
 			gates.push({
 				gate: "safety",
 				passed: safetyResult.verdict === "pass",
diff --git a/src/index.ts b/src/index.ts
index a6e0066..d032c4b 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -95,9 +95,14 @@ async function main(): Promise<void> {
 
 	setMemoryHealthProvider(() => memory.healthCheck());
 
+	// Runtime is created before evolution so we can wire it into the engine.
+	// Evolution judges run through the same Agent SDK subprocess as the main
+	// agent, which means a single auth path and a single provider switch.
+	const runtime = new AgentRuntime(config, db);
+
 	let evolution: EvolutionEngine | null = null;
 	try {
-		evolution = new EvolutionEngine();
+		evolution = new EvolutionEngine(undefined, runtime);
 		const currentVersion = evolution.getCurrentVersion();
 		const judgeMode = evolution.usesLLMJudges() ? "LLM judges" : "heuristic";
 		console.log(`[evolution] Engine initialized (v${currentVersion}, ${judgeMode})`);
@@ -107,8 +112,6 @@ async function main(): Promise<void> {
 		console.warn(`[evolution] Failed to initialize: ${msg}. Running without self-evolution.`);
 	}
 
-	const runtime = new AgentRuntime(config, db);
-
 	if (activeRole) {
 		runtime.setRoleTemplate(activeRole);
 	}
@@ -494,7 +497,7 @@ async function main(): Promise<void> {
 			if (useLLMConsolidation) {
 				const evolvedConfig = evolution?.getConfig();
 				const existingFacts = evolvedConfig ? `${evolvedConfig.userProfile}\n${evolvedConfig.domainKnowledge}` : "";
-				consolidateSessionWithLLM(memory, sessionData, existingFacts)
+				consolidateSessionWithLLM(runtime, memory, sessionData, existingFacts)
 					.then(({ result, judgeCost }) => {
 						if (judgeCost) {
 							evolution?.trackExternalJudgeCost(judgeCost);
diff --git a/src/memory/consolidation.ts b/src/memory/consolidation.ts
index 35868d7..b88234d 100644
--- a/src/memory/consolidation.ts
+++ b/src/memory/consolidation.ts
@@ -1,3 +1,4 @@
+import type { AgentRuntime } from "../agent/runtime.ts";
 import { runConsolidationJudge } from "../evolution/judges/consolidation-judge.ts";
 import type { JudgeCostEntry } from "../evolution/judges/types.ts";
 import type { SessionSummary } from "../evolution/types.ts";
@@ -11,13 +12,14 @@ import type { ConsolidationResult, Episode, SemanticFact } from "./types.ts";
  * existing knowledge, and repeatable procedures.
  */
 export async function consolidateSessionWithLLM(
+	runtime: AgentRuntime,
 	memory: MemorySystem,
 	sessionData: SessionData,
 	existingFacts: string,
 ): Promise<{ result: ConsolidationResult; judgeCost: JudgeCostEntry | null }> {
 	try {
 		const session = sessionDataToSummary(sessionData);
-		const judgeResult = await runConsolidationJudge(session, existingFacts);
+		const judgeResult = await runConsolidationJudge(runtime, session, existingFacts);
 
 		const startTime = Date.now();
 		let factsExtracted = 0;
diff --git a/src/ui/__tests__/events.test.ts b/src/ui/__tests__/events.test.ts
index 8ceebbc..680dbed 100644
--- a/src/ui/__tests__/events.test.ts
+++ b/src/ui/__tests__/events.test.ts
@@ -62,15 +62,18 @@ describe("subscribe/publish", () => {
 	});
 
 	test("getListenerCount tracks active listeners", () => {
-		expect(getListenerCount()).toBe(0);
+		// events.ts uses a module-level listener set that can be touched by
+		// other tests in the same bun test process. Measure relative to the
+		// initial count so this test is robust to ordering.
+		const initial = getListenerCount();
 		const unsub1 = subscribe(() => {});
-		expect(getListenerCount()).toBe(1);
+		expect(getListenerCount()).toBe(initial + 1);
 		const unsub2 = subscribe(() => {});
-		expect(getListenerCount()).toBe(2);
+		expect(getListenerCount()).toBe(initial + 2);
 		unsub1();
-		expect(getListenerCount()).toBe(1);
+		expect(getListenerCount()).toBe(initial + 1);
 		unsub2();
-		expect(getListenerCount()).toBe(0);
+		expect(getListenerCount()).toBe(initial);
 	});
 });