diff --git a/CLAUDE.md b/CLAUDE.md index ea7608f0f..492e4c944 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -159,6 +159,28 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m 5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic. +## Completing Work — E2E Checklist + +Before marking any branch as ready for review, complete this checklist: + +1. **Copy `.env` to worktree** (if working in a git worktree): + ```bash + cp /home/christso/projects/agentv/.env .env + ``` + Without this, any eval run or LLM-dependent test will fail with missing API key errors. + +2. **Run unit tests**: `bun run test` — all must pass. + +3. **Run at least one real eval** against an example file to verify end-to-end behavior: + ```bash + bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id + ``` + Inspect the output JSONL to confirm correct evaluator type, scores, and hits/misses. + +4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types). + +5. **Mark PR as ready** only after all above steps pass. + ## Evaluator Type System Evaluator types use **kebab-case** everywhere (matching promptfoo convention): @@ -248,6 +270,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow: ``` 4. 
**Before merging**, ensure: + - **E2E verification completed** (see "Completing Work — E2E Checklist" above) - CI pipeline passes (all checks green) - Code has been reviewed if required - No merge conflicts with `main` diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 4a7ec1b50..7e2117107 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -163,6 +163,17 @@ export const evalRunCommand = command({ description: 'Write companion artifacts (grading/.json, timing.json, benchmark.json) to the specified directory', }), + judgeTarget: option({ + type: optional(string), + long: 'judge-target', + description: + 'Override judge target for all evaluators (e.g., "agentv", or a target name from targets.yaml)', + }), + model: option({ + type: optional(string), + long: 'model', + description: 'Override model for the judge target (e.g., "openai:gpt-5-mini")', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -203,6 +214,8 @@ export const evalRunCommand = command({ strict: args.strict, benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, + judgeTarget: args.judgeTarget, + model: args.model, }; await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); }, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 43eec380b..1f3d77f46 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -82,6 +82,8 @@ interface NormalizedOptions { readonly workspacePath?: string; readonly benchmarkJson?: string; readonly artifacts?: string; + readonly judgeTarget?: string; + readonly model?: string; } function normalizeBoolean(value: unknown): boolean { @@ -249,6 +251,8 @@ function normalizeOptions( workspacePath, benchmarkJson: normalizeString(rawOptions.benchmarkJson), artifacts: normalizeString(rawOptions.artifacts), + judgeTarget: 
normalizeString(rawOptions.judgeTarget), + model: normalizeString(rawOptions.model), } satisfies NormalizedOptions; } @@ -593,6 +597,8 @@ async function runSingleEvalFile(params: { trials: trialsConfig, totalBudgetUsd, failOnError, + judgeTarget: options.judgeTarget, + model: options.model, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { // Finalize streaming observer span with score @@ -674,6 +680,11 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + // Validate --judge-target / --model combinations + if (options.judgeTarget === 'agentv' && !options.model) { + throw new Error('--judge-target agentv requires --model (e.g., --model openai:gpt-5-mini)'); + } + // --retry-errors: override filter to only re-run execution_error test cases. // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, // since the retry source and output destination may refer to the same file. diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx index cc1545b64..4881674a9 100644 --- a/apps/web/src/content/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/evaluation/eval-cases.mdx @@ -265,7 +265,7 @@ tests: ### `assert` present — explicit evaluators only -When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, `agent-judge`, or `rubrics`) receive `criteria` as input automatically. +When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, or `rubrics`) receive `criteria` as input automatically. 
If `assert` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: diff --git a/apps/web/src/content/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/guides/agent-eval-layers.mdx index 783a2ca55..6d0f542cf 100644 --- a/apps/web/src/content/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/guides/agent-eval-layers.mdx @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based | Concern | AgentV evaluator | |---------|-----------------| -| Plan quality & coherence | `llm_judge` with reasoning-focused prompt | -| Workspace-aware auditing | `agent_judge` with rubrics | +| Plan quality & coherence | `llm-judge` with reasoning-focused prompt | +| Workspace-aware auditing | `llm-judge` with rubrics | ```yaml # Layer 1: Reasoning — verify the agent's plan makes sense @@ -29,7 +29,7 @@ assertions: Did it select appropriate tools for the task? Score 1.0 if reasoning is sound, 0.0 if not. 
- name: workspace-audit - type: agent-judge + type: llm-judge max_steps: 5 temperature: 0 rubrics: diff --git a/bun.lock b/bun.lock index 200a436cc..70471cff6 100644 --- a/bun.lock +++ b/bun.lock @@ -24,7 +24,7 @@ }, "apps/cli": { "name": "agentv", - "version": "2.12.0", + "version": "2.19.0", "bin": { "agentv": "./dist/cli.js", }, @@ -61,13 +61,14 @@ }, "packages/core": { "name": "@agentv/core", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", @@ -95,7 +96,7 @@ }, "packages/eval": { "name": "@agentv/eval", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "zod": "^3.23.8", }, diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md index a20a7909f..9d6047886 100644 --- a/docs/plans/2026-02-26-eval-schema-generation-design.md +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -248,14 +248,9 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.literal('agent_judge'), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); +// Note: agent_judge was removed — llm-judge now covers all judge use cases +// including agentic behavior (auto-detected based on judge provider kind). +// See LlmJudgeSchema above for the unified schema. 
const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), @@ -292,7 +287,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/examples/features/agent-judge/.agentv/targets.yaml b/examples/features/agent-judge/.agentv/targets.yaml deleted file mode 100644 index 6d5c82918..000000000 --- a/examples/features/agent-judge/.agentv/targets.yaml +++ /dev/null @@ -1,22 +0,0 @@ -targets: - # Mock agent that "creates tests" in the workspace. - # Each test gets a fresh copy of workspace-template/ as its CWD. - - name: mock_agent - provider: cli - command: >- - bash -c ' - mkdir -p tests && - printf "import { add, multiply } from \"../src/main\";\n\ndescribe(\"math functions\", () => {\n test(\"add returns sum\", () => {\n expect(add(2, 3)).toBe(5);\n });\n\n test(\"multiply returns product\", () => {\n expect(multiply(4, 5)).toBe(20);\n });\n});\n" > tests/math.test.ts && - printf "import { greet } from \"../src/main\";\n\ndescribe(\"greet\", () => {\n test(\"returns greeting\", () => {\n expect(greet(\"World\")).toBe(\"Hello, World!\");\n });\n});\n" > tests/greet.test.ts && - echo "Created test files: tests/math.test.ts and tests/greet.test.ts" > {OUTPUT_FILE} - ' - workspace_template: ../workspace-template - judge_target: azure_judge - - # Azure OpenAI target used as judge provider for built-in agent_judge mode. 
- - name: azure_judge - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} diff --git a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl b/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl deleted file mode 100644 index bc2d5b6ee..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"timestamp":"2026-02-20T21:37:58.641Z","test_id":"verify-test-creation-freeform","dataset":"dataset","score":1,"hits":["Created tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"target":"mock_agent","reasoning":"workspace-audit: All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","scores":[{"name":"workspace-audit","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["Created tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"reasoning":"All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","details":{"mode":"built-in","steps":3,"tool_calls":5}}]} -{"timestamp":"2026-02-20T21:37:59.540Z","test_id":"verify-test-creation-rubric","dataset":"dataset","score":1,"hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 
'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"target":"mock_agent","reasoning":"workspace-audit-rubric: All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. The candidate answer meets all rubric criteria.","scores":[{"name":"workspace-audit-rubric","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"reasoning":"All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. The candidate answer meets all rubric criteria.","details":{"mode":"built-in","steps":2,"tool_calls":3}}]} diff --git a/examples/features/agent-judge/evals/dataset.eval.yaml b/examples/features/agent-judge/evals/dataset.eval.yaml deleted file mode 100644 index a9bf21048..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Agent Judge feature demonstration -# Tests that the agent_judge evaluator can investigate the workspace -# to verify that an agent created the expected files and content. 
-# -# The mock_agent creates test files in the workspace-template. -# The agent_judge evaluator uses an AI SDK agent loop with filesystem tools -# to verify the test files exist and contain proper test cases. - -description: Verify agent_judge evaluator can audit workspace file creation - -execution: - target: mock_agent - -tests: - # Case 1: freeform agent_judge (no rubrics) — scores 0-1 - - id: verify-test-creation-freeform - criteria: >- - The agent should create unit test files in a tests/ directory. - Test files should import from src/main.ts and test the add, multiply, - and greet functions with meaningful assertions. - - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit - type: agent-judge - max_steps: 5 - temperature: 0 - - # Case 2: rubric-based agent_judge — structured evaluation - - id: verify-test-creation-rubric - criteria: >- - The agent should create comprehensive unit tests for the project. 
- - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit-rubric - type: agent-judge - max_steps: 5 - temperature: 0 - rubrics: - - id: tests-dir-exists - outcome: "A tests/ directory exists in the workspace" - weight: 1.0 - required: true - - id: math-tests - outcome: "Test file exists that tests the add and multiply functions" - weight: 1.0 - required: true - - id: greet-tests - outcome: "Test file exists that tests the greet function" - weight: 1.0 - - id: assertions-present - outcome: "Tests contain proper assertions (expect/assert calls)" - weight: 0.5 diff --git a/examples/features/agent-judge/workspace-template/package.json b/examples/features/agent-judge/workspace-template/package.json deleted file mode 100644 index 24d635536..000000000 --- a/examples/features/agent-judge/workspace-template/package.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "sample-project", - "version": "1.0.0", - "type": "module" -} diff --git a/examples/features/agent-judge/workspace-template/src/main.ts b/examples/features/agent-judge/workspace-template/src/main.ts deleted file mode 100644 index cfda22527..000000000 --- a/examples/features/agent-judge/workspace-template/src/main.ts +++ /dev/null @@ -1,11 +0,0 @@ -export function add(a: number, b: number): number { - return a + b; -} - -export function multiply(a: number, b: number): number { - return a * b; -} - -export function greet(name: string): string { - return `Hello, ${name}!`; -} diff --git a/examples/features/file-changes-judges/.agentv/targets.yaml b/examples/features/file-changes-judges/.agentv/targets.yaml index d9645bc03..10c067b31 100644 --- a/examples/features/file-changes-judges/.agentv/targets.yaml +++ b/examples/features/file-changes-judges/.agentv/targets.yaml @@ -11,7 +11,7 @@ targets: workspace_template: ../workspace-template judge_target: azure_judge - # Azure OpenAI — used as LLM judge (rubrics) and built-in 
agent_judge provider + # Azure OpenAI — used as LLM judge (rubrics) and built-in llm-judge provider - name: azure_judge provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} @@ -19,7 +19,7 @@ targets: model: ${{ AZURE_DEPLOYMENT_NAME }} version: ${{ AZURE_OPENAI_API_VERSION }} - # Copilot CLI — used as delegated agent_judge target + # Copilot CLI — used as delegated llm-judge target - name: copilot_judge provider: copilot-cli model: claude-haiku-4.5 diff --git a/examples/features/file-changes-judges/evals/dataset.eval.yaml b/examples/features/file-changes-judges/evals/dataset.eval.yaml index 65ebd68df..2fb796537 100644 --- a/examples/features/file-changes-judges/evals/dataset.eval.yaml +++ b/examples/features/file-changes-judges/evals/dataset.eval.yaml @@ -2,13 +2,13 @@ # # Proves that file_changes diffs are correctly passed to all judge types: # 1. rubrics — LLM judge (Azure) evaluates the diff -# 2. agent_judge — built-in mode (Azure via AI SDK) sees file_changes in prompt -# 3. agent_judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt +# 2. llm-judge — built-in mode (Azure via AI SDK) sees file_changes in prompt +# 3. llm-judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt # # The mock agent adds a `subtract` function to calculator.ts, producing a small # diff (~10 lines) that fits comfortably in any LLM context window. -description: Verify file_changes diffs are accessible to LLM judge, built-in agent judge, and copilot-cli agent judge +description: Verify file_changes diffs are accessible to LLM judge (rubrics, built-in, and copilot-cli) execution: target: mock_agent @@ -43,14 +43,14 @@ tests: outcome: "The file_changes contains a valid unified diff format" weight: 0.5 - # 2. Built-in agent judge — Azure via AI SDK with filesystem tools - - name: agent-judge-builtin - type: agent-judge + # 2. 
Built-in LLM judge — Azure via AI SDK with filesystem tools + - name: llm-judge-builtin + type: llm-judge max_steps: 3 temperature: 0 - # 3. Copilot CLI agent judge — delegated via target - - name: agent-judge-copilot - type: agent-judge + # 3. Copilot CLI LLM judge — delegated via target + - name: llm-judge-copilot + type: llm-judge target: copilot_judge temperature: 0 diff --git a/packages/core/package.json b/packages/core/package.json index d0c0a031e..600890177 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -39,11 +39,12 @@ }, "files": ["dist", "README.md"], "dependencies": { - "@agentv/eval": "workspace:*", "@agentclientprotocol/sdk": "^0.14.1", + "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", diff --git a/packages/core/src/evaluation/evaluators/agent-judge.ts b/packages/core/src/evaluation/evaluators/agent-judge.ts deleted file mode 100644 index 2dc00f769..000000000 --- a/packages/core/src/evaluation/evaluators/agent-judge.ts +++ /dev/null @@ -1,598 +0,0 @@ -import fs from 'node:fs/promises'; -import path from 'node:path'; - -import { generateText, stepCountIs, tool } from 'ai'; -import { z } from 'zod'; - -import { extractLastAssistantContent } from '../providers/types.js'; -import type { Provider } from '../providers/types.js'; -import { TEMPLATE_VARIABLES } from '../template-variables.js'; -import type { JsonObject, RubricItem } from '../types.js'; -import { - buildOutputSchema, - buildRubricOutputSchema, - calculateRubricScore, - freeformEvaluationSchema, - rubricEvaluationSchema, - substituteVariables, -} from './llm-judge.js'; -import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; -import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; - 
-const DEFAULT_MAX_STEPS = 10; -const MAX_STEPS_LIMIT = 50; -const MAX_FILE_SIZE = 50 * 1024; // 50KB -const MAX_SEARCH_MATCHES = 20; - -/** - * Directories/patterns to skip during file search. - */ -const SEARCH_SKIP_DIRS = new Set([ - 'node_modules', - '.git', - '.next', - 'dist', - '__pycache__', - '.cache', -]); - -/** - * Binary file extensions to skip during search. - */ -const BINARY_EXTENSIONS = new Set([ - '.png', - '.jpg', - '.jpeg', - '.gif', - '.ico', - '.svg', - '.woff', - '.woff2', - '.ttf', - '.eot', - '.mp3', - '.mp4', - '.wav', - '.zip', - '.tar', - '.gz', - '.pdf', - '.exe', - '.dll', - '.so', - '.dylib', -]); - -export interface AgentJudgeEvaluatorOptions { - readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - readonly maxSteps?: number; - readonly temperature?: number; - readonly evaluatorTemplate?: string; - readonly judgeTargetProvider?: Provider; -} - -export class AgentJudgeEvaluator implements Evaluator { - readonly kind = 'agent-judge'; - - private readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - private readonly maxSteps: number; - private readonly temperature: number; - private readonly evaluatorTemplate?: string; - private readonly judgeTargetProvider?: Provider; - - constructor(options: AgentJudgeEvaluatorOptions) { - this.resolveJudgeProvider = options.resolveJudgeProvider; - this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); - this.temperature = options.temperature ?? 0; - this.evaluatorTemplate = options.evaluatorTemplate; - this.judgeTargetProvider = options.judgeTargetProvider; - } - - async evaluate(context: EvaluationContext): Promise { - if (this.judgeTargetProvider) { - return this.evaluateWithJudgeTarget(context); - } - return this.evaluateBuiltIn(context); - } - - /** - * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. 
- */ - private async evaluateBuiltIn(context: EvaluationContext): Promise { - const judgeProvider = await this.resolveJudgeProvider(context); - if (!judgeProvider) { - throw new Error('No judge provider available for agent-judge evaluation'); - } - - const model = judgeProvider.asLanguageModel?.(); - if (!model) { - throw new Error( - `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent-judge mode`, - ); - } - - const workspacePath = context.workspacePath; - if (!workspacePath) { - throw new Error( - 'agent-judge evaluator requires a workspace_template target (workspacePath is not set)', - ); - } - - const systemPrompt = this.buildSystemPrompt(context); - const userPrompt = this.buildUserPrompt(context); - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const fsTools = createFilesystemTools(workspacePath); - - const evaluatorRawRequest: JsonObject = { - mode: 'built-in', - systemPrompt, - userPrompt, - target: judgeProvider.targetName, - maxSteps: this.maxSteps, - }; - - try { - const { text, steps } = await generateText({ - model, - system: systemPrompt, - prompt: userPrompt, - tools: fsTools, - stopWhen: stepCountIs(this.maxSteps), - temperature: this.temperature, - }); - - const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0); - - const details: JsonObject = { - mode: 'built-in', - steps: steps.length, - tool_calls: toolCallCount, - }; - - return this.parseResult(text, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge built-in evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'built-in', error: message }, - }; - } - } - - /** - * Judge target mode: Delegates to an external agent provider via Provider.invoke(). - */ - private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { - const provider = this.judgeTargetProvider as Provider; - - const workspacePath = context.workspacePath; - const prompt = this.buildDelegatedPrompt(context); - - const evaluatorRawRequest: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - prompt, - }; - - try { - const response = await provider.invoke({ - question: prompt, - cwd: workspacePath, - evalCaseId: context.evalCase.id, - attempt: context.attempt, - }); - - const assistantContent = extractLastAssistantContent(response.output); - if (!assistantContent) { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['agent-judge judge_target returned no assistant response'], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'judge_target', judge_target: provider.targetName }, - }; - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const details: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - }; - - return this.parseResult(assistantContent, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge judge_target evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { - mode: 'judge_target', - judge_target: provider.targetName, - error: message, - }, - }; - } - } - - /** - * Parse the agent's response text into an EvaluationScore. 
- * Supports both freeform and rubric modes. - */ - private parseResult( - text: string, - rubrics: readonly RubricItem[] | undefined, - evaluatorRawRequest: JsonObject, - details: JsonObject, - ): EvaluationScore { - try { - const parsed = parseJsonFromText(text); - - if (rubrics && rubrics.length > 0) { - const data = rubricEvaluationSchema.parse(parsed); - const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); - return { - score, - verdict, - hits, - misses, - expectedAspectCount: rubrics.length, - reasoning: data.overall_reasoning, - evaluatorRawRequest, - details, - }; - } - - const data = freeformEvaluationSchema.parse(parsed); - const score = clampScore(data.score); - const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; - const misses = Array.isArray(data.misses) - ? data.misses.filter(isNonEmptyString).slice(0, 4) - : []; - - return { - score, - verdict: scoreToVerdict(score), - hits, - misses, - expectedAspectCount: Math.max(hits.length + misses.length, 1), - reasoning: data.reasoning, - evaluatorRawRequest, - details, - }; - } catch { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['Failed to parse agent-judge response as valid evaluation JSON'], - expectedAspectCount: 1, - evaluatorRawRequest, - details, - }; - } - } - - /** - * Build system prompt for built-in mode. - * Includes output format instructions. - */ - private buildSystemPrompt(context: EvaluationContext): string { - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'You are an expert evaluator with access to the workspace filesystem.', - 'Use the provided tools to investigate the workspace and verify the criteria are met.', - 'Thoroughly examine relevant files before making your assessment.', - '', - ]; - - if (rubrics && rubrics.length > 0) { - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } - - /** - * Build user prompt for built-in mode. - * Uses custom template if provided, otherwise builds default prompt. - */ - private buildUserPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - - if (this.evaluatorTemplate) { - return substituteVariables(this.evaluatorTemplate, variables); - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'Evaluate the candidate answer by investigating the workspace.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push( - '', - 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', - ); - } else { - parts.push( - 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', - ); - } - - return parts.join('\n'); - } - - /** - * Build the full evaluation prompt for judge target mode (delegation). - * Combines task context, criteria, candidate info, and output format instructions. - */ - private buildDelegatedPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - if (this.evaluatorTemplate) { - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? 
'').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - const customPrompt = substituteVariables(this.evaluatorTemplate, variables); - - const outputSchema = - rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); - - return `${customPrompt}\n\n${outputSchema}`; - } - - const parts: string[] = [ - 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push(''); - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } -} - -// --------------------------------------------------------------------------- -// Sandboxed filesystem tools for built-in mode -// --------------------------------------------------------------------------- - -/** - * Resolve a relative path within the sandbox, preventing path traversal. - * Returns the absolute path if valid, or throws if the path escapes the sandbox. 
- */ -function resolveSandboxed(basePath: string, relativePath: string): string { - const resolved = path.resolve(basePath, relativePath); - if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { - throw new Error(`Path '${relativePath}' is outside the workspace`); - } - return resolved; -} - -/** - * Create sandboxed filesystem tools for the AI SDK agent loop. - */ -function createFilesystemTools(workspacePath: string) { - return { - list_files: tool({ - description: - 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', - inputSchema: z.object({ - path: z.string().describe('Relative path within workspace (use "." for root)').default('.'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const entries = await fs.readdir(resolved, { withFileTypes: true }); - return entries - .map((e) => ({ - name: e.name, - type: e.isDirectory() ? 'directory' : 'file', - })) - .slice(0, 100); - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - read_file: tool({ - description: - 'Read the content of a file at a relative path within the workspace. 
Large files are truncated at 50KB.', - inputSchema: z.object({ - path: z.string().describe('Relative path to file within workspace'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const stat = await fs.stat(resolved); - if (stat.isDirectory()) { - return { error: `'${input.path}' is a directory, not a file` }; - } - const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); - const fd = await fs.open(resolved, 'r'); - try { - await fd.read(buffer, 0, buffer.length, 0); - } finally { - await fd.close(); - } - const content = buffer.toString('utf-8'); - const truncated = stat.size > MAX_FILE_SIZE; - return { content, truncated, size: stat.size }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - search_files: tool({ - description: - 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', - inputSchema: z.object({ - pattern: z.string().describe('Regex pattern to search for'), - path: z.string().describe('Relative path to search within (use "." for root)').default('.'), - }), - execute: async (input: { pattern: string; path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const regex = new RegExp(input.pattern, 'gi'); - const matches: Array<{ file: string; line: number; text: string }> = []; - - await searchDirectory(resolved, workspacePath, regex, matches); - - return { matches, total: matches.length }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - }; -} - -/** - * Recursively search a directory for regex matches. 
- */ -async function searchDirectory( - dirPath: string, - workspacePath: string, - regex: RegExp, - matches: Array<{ file: string; line: number; text: string }>, -): Promise { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - let entries: import('node:fs').Dirent[]; - try { - entries = await fs.readdir(dirPath, { withFileTypes: true }); - } catch { - return; - } - - for (const entry of entries) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - if (SEARCH_SKIP_DIRS.has(entry.name)) continue; - - const fullPath = path.join(dirPath, entry.name); - - if (entry.isDirectory()) { - await searchDirectory(fullPath, workspacePath, regex, matches); - } else if (entry.isFile()) { - const ext = path.extname(entry.name).toLowerCase(); - if (BINARY_EXTENSIONS.has(ext)) continue; - - try { - const stat = await fs.stat(fullPath); - if (stat.size > MAX_FILE_SIZE) continue; - - const content = await fs.readFile(fullPath, 'utf-8'); - const lines = content.split('\n'); - - for (let i = 0; i < lines.length; i++) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - regex.lastIndex = 0; - if (regex.test(lines[i])) { - matches.push({ - file: path.relative(workspacePath, fullPath), - line: i + 1, - text: lines[i].substring(0, 200), - }); - } - } - } catch { - // Skip unreadable files - } - } - } -} diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 157ff7c99..a64705fbe 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -52,9 +52,6 @@ export { } from './llm-judge.js'; export type { LlmJudgeEvaluatorOptions } from './llm-judge.js'; -export { AgentJudgeEvaluator } from './agent-judge.js'; -export type { AgentJudgeEvaluatorOptions } from './agent-judge.js'; - export { SkillTriggerEvaluator } from './skill-trigger.js'; export { assembleLlmJudgePrompt } from './llm-judge-prompt.js'; diff --git 
a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 46125f3e7..88e6a5268 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -1,14 +1,65 @@ -import { generateText } from 'ai'; +import fs from 'node:fs/promises'; +import path from 'node:path'; + +import { generateText, stepCountIs, tool } from 'ai'; import { z } from 'zod'; import type { Provider, ProviderResponse } from '../providers/types.js'; -import { extractLastAssistantContent } from '../providers/types.js'; +import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { JsonObject, RubricItem } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; +// --------------------------------------------------------------------------- +// Constants for built-in agent mode (filesystem tools) +// --------------------------------------------------------------------------- + +const DEFAULT_MAX_STEPS = 10; +const MAX_STEPS_LIMIT = 50; +const MAX_FILE_SIZE = 50 * 1024; // 50KB +const MAX_SEARCH_MATCHES = 20; + +/** + * Directories/patterns to skip during file search. + */ +const SEARCH_SKIP_DIRS = new Set([ + 'node_modules', + '.git', + '.next', + 'dist', + '__pycache__', + '.cache', +]); + +/** + * Binary file extensions to skip during search. + */ +const BINARY_EXTENSIONS = new Set([ + '.png', + '.jpg', + '.jpeg', + '.gif', + '.ico', + '.svg', + '.woff', + '.woff2', + '.ttf', + '.eot', + '.mp3', + '.mp4', + '.wav', + '.zip', + '.tar', + '.gz', + '.pdf', + '.exe', + '.dll', + '.so', + '.dylib', +]); + /** * Default evaluator template for the user prompt (variables will be substituted). 
* Custom evaluators can override this via evaluatorTemplate option. @@ -38,6 +89,8 @@ export interface LlmJudgeEvaluatorOptions { readonly maxOutputTokens?: number; readonly temperature?: number; readonly evaluatorTemplate?: string; + readonly maxSteps?: number; + readonly judgeTargetProvider?: Provider; } const freeformEvaluationSchema = z.object({ @@ -82,20 +135,40 @@ export class LlmJudgeEvaluator implements Evaluator { private readonly maxOutputTokens?: number; private readonly temperature?: number; private readonly evaluatorTemplate?: string; + private readonly maxSteps: number; + private readonly judgeTargetProvider?: Provider; constructor(options: LlmJudgeEvaluatorOptions) { this.resolveJudgeProvider = options.resolveJudgeProvider; this.maxOutputTokens = options.maxOutputTokens; this.temperature = options.temperature; this.evaluatorTemplate = options.evaluatorTemplate; + this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); + this.judgeTargetProvider = options.judgeTargetProvider; } async evaluate(context: EvaluationContext): Promise { + // Delegate mode: judge target provider is an agent provider — send prompt via invoke() + if (this.judgeTargetProvider) { + return this.evaluateWithJudgeTarget(context); + } + const judgeProvider = await this.resolveJudgeProvider(context); if (!judgeProvider) { throw new Error('No judge provider available for LLM grading'); } + // Built-in agent mode: agentv provider → AI SDK generateText with filesystem tools + if (judgeProvider.kind === 'agentv') { + return this.evaluateBuiltIn(context, judgeProvider); + } + + // Delegate mode: resolved provider is an agent provider → send prompt via invoke() + if (isAgentProvider(judgeProvider)) { + return this.evaluateWithDelegatedAgent(context, judgeProvider); + } + + // LLM mode: structured JSON evaluation const config = context.evaluator; if (config?.type === 'llm-judge' && config.rubrics && config.rubrics.length > 0) { return 
this.evaluateWithRubrics(context, judgeProvider, config.rubrics); @@ -104,6 +177,10 @@ export class LlmJudgeEvaluator implements Evaluator { return this.evaluateFreeform(context, judgeProvider); } + // --------------------------------------------------------------------------- + // LLM mode (existing) + // --------------------------------------------------------------------------- + private async evaluateFreeform( context: EvaluationContext, judgeProvider: Provider, @@ -177,7 +254,7 @@ export class LlmJudgeEvaluator implements Evaluator { tokenUsage, }; } catch (e: unknown) { - // Judge parse failure → skip (not silent zero). + // Judge parse failure -> skip (not silent zero). // Signals infrastructure error to downstream consumers, excluded from score averages. const message = e instanceof Error ? e.message : String(e); const evalName = context.evaluator?.name ?? 'llm-judge'; @@ -314,6 +391,393 @@ export class LlmJudgeEvaluator implements Evaluator { } } + // --------------------------------------------------------------------------- + // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools) + // --------------------------------------------------------------------------- + + /** + * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. 
+ */ + private async evaluateBuiltIn( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + const model = judgeProvider.asLanguageModel?.(); + if (!model) { + throw new Error( + `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent mode`, + ); + } + + const workspacePath = context.workspacePath; + if (!workspacePath) { + throw new Error( + 'llm-judge built-in agent mode requires a workspace_template target (workspacePath is not set)', + ); + } + + const systemPrompt = this.buildAgentSystemPrompt(context); + const userPrompt = this.buildAgentUserPrompt(context); + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const fsTools = createFilesystemTools(workspacePath); + + const evaluatorRawRequest: JsonObject = { + mode: 'built-in', + systemPrompt, + userPrompt, + target: judgeProvider.targetName, + maxSteps: this.maxSteps, + }; + + try { + const { text, steps } = await generateText({ + model, + system: systemPrompt, + prompt: userPrompt, + tools: fsTools, + stopWhen: stepCountIs(this.maxSteps), + temperature: this.temperature ?? 0, + }); + + const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0); + + const details: JsonObject = { + mode: 'built-in', + steps: steps.length, + tool_calls: toolCallCount, + }; + + return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge built-in evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: 'built-in', error: message }, + }; + } + } + + // --------------------------------------------------------------------------- + // Delegate mode (agent provider — send prompt via Provider.invoke()) + // --------------------------------------------------------------------------- + + /** + * Judge target mode: Delegates to an explicit judgeTargetProvider via Provider.invoke(). + */ + private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { + return this.evaluateWithDelegate(context, this.judgeTargetProvider as Provider, 'judge_target'); + } + + /** + * Delegate mode: resolved provider is an agent provider — send prompt via invoke(). + */ + private async evaluateWithDelegatedAgent( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + return this.evaluateWithDelegate(context, judgeProvider, 'delegate'); + } + + /** + * Shared implementation for judge_target and delegate modes. + * Both invoke a provider and parse the agent result from the response. 
+ */ + private async evaluateWithDelegate( + context: EvaluationContext, + provider: Provider, + modeLabel: string, + ): Promise { + const workspacePath = context.workspacePath; + const prompt = this.buildDelegatedPrompt(context); + + const evaluatorRawRequest: JsonObject = { + mode: modeLabel, + judge_target: provider.targetName, + prompt, + }; + + try { + const response = await provider.invoke({ + question: prompt, + cwd: workspacePath, + evalCaseId: context.evalCase.id, + attempt: context.attempt, + }); + + const assistantContent = extractLastAssistantContent(response.output); + if (!assistantContent) { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge ${modeLabel} returned no assistant response`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: modeLabel, judge_target: provider.targetName }, + }; + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const details: JsonObject = { + mode: modeLabel, + judge_target: provider.targetName, + }; + + return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge ${modeLabel} evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { + mode: modeLabel, + judge_target: provider.targetName, + error: message, + }, + }; + } + } + + // --------------------------------------------------------------------------- + // Prompt builders for agent modes + // --------------------------------------------------------------------------- + + /** + * Build system prompt for built-in agent mode. + * Includes output format instructions. + */ + private buildAgentSystemPrompt(context: EvaluationContext): string { + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; + + const parts: string[] = [ + 'You are an expert evaluator with access to the workspace filesystem.', + 'Use the provided tools to investigate the workspace and verify the criteria are met.', + 'Thoroughly examine relevant files before making your assessment.', + '', + ]; + + if (rubrics && rubrics.length > 0) { + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + /** + * Build user prompt for built-in agent mode. + * Uses custom template if provided, otherwise builds default prompt. + */ + private buildAgentUserPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + }; + + if (this.evaluatorTemplate) { + return substituteVariables(this.evaluatorTemplate, variables); + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; + + const parts: string[] = [ + 'Evaluate the candidate answer by investigating the workspace.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push( + '', + 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', + ); + } else { + parts.push( + 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', + ); + } + + return parts.join('\n'); + } + + /** + * Build the full evaluation prompt for delegate mode (agent providers). + * Combines task context, criteria, candidate info, and output format instructions. + */ + private buildDelegatedPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + if (this.evaluatorTemplate) { + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? 
'').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + }; + const customPrompt = substituteVariables(this.evaluatorTemplate, variables); + + const outputSchema = + rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); + + return `${customPrompt}\n\n${outputSchema}`; + } + + const parts: string[] = [ + 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push(''); + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + // --------------------------------------------------------------------------- + // Agent result parser (shared by built-in and delegate modes) + // --------------------------------------------------------------------------- + + /** + * Parse the agent's response text into an EvaluationScore. + * Supports both freeform and rubric modes. 
+ */ + private parseAgentResult( + text: string, + rubrics: readonly RubricItem[] | undefined, + evaluatorRawRequest: JsonObject, + details: JsonObject, + ): EvaluationScore { + try { + const parsed = parseJsonFromText(text); + + if (rubrics && rubrics.length > 0) { + const data = rubricEvaluationSchema.parse(parsed); + const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); + return { + score, + verdict, + hits, + misses, + expectedAspectCount: rubrics.length, + reasoning: data.overall_reasoning, + evaluatorRawRequest, + details, + }; + } + + const data = freeformEvaluationSchema.parse(parsed); + const score = clampScore(data.score); + const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; + const misses = Array.isArray(data.misses) + ? data.misses.filter(isNonEmptyString).slice(0, 4) + : []; + + return { + score, + verdict: scoreToVerdict(score), + hits, + misses, + expectedAspectCount: Math.max(hits.length + misses.length, 1), + reasoning: data.reasoning, + evaluatorRawRequest, + details, + }; + } catch { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: ['Failed to parse llm-judge agent response as valid evaluation JSON'], + expectedAspectCount: 1, + evaluatorRawRequest, + details, + }; + } + } + + // --------------------------------------------------------------------------- + // LLM mode prompt builders + // --------------------------------------------------------------------------- + /** * Build prompt for score-range rubric evaluation. 
*/ @@ -421,6 +885,10 @@ export class LlmJudgeEvaluator implements Evaluator { return parts.join('\n'); } + // --------------------------------------------------------------------------- + // LLM mode retry logic + // --------------------------------------------------------------------------- + private async runWithRetry(options: { readonly context: EvaluationContext; readonly judgeProvider: Provider; @@ -474,6 +942,10 @@ export class LlmJudgeEvaluator implements Evaluator { } } +// --------------------------------------------------------------------------- +// Output schema builders (exported for reuse) +// --------------------------------------------------------------------------- + /** * Build the mandatory output schema that all evaluators must follow. * This schema is always appended to the evaluator template. @@ -656,3 +1128,162 @@ function calculateScoreRangeResult( }, }; } + +// --------------------------------------------------------------------------- +// Sandboxed filesystem tools for built-in agent mode +// --------------------------------------------------------------------------- + +/** + * Resolve a relative path within the sandbox, preventing path traversal. + * Returns the absolute path if valid, or throws if the path escapes the sandbox. + */ +function resolveSandboxed(basePath: string, relativePath: string): string { + const resolved = path.resolve(basePath, relativePath); + if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { + throw new Error(`Path '${relativePath}' is outside the workspace`); + } + return resolved; +} + +/** + * Create sandboxed filesystem tools for the AI SDK agent loop. + */ +function createFilesystemTools(workspacePath: string) { + return { + list_files: tool({ + description: + 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', + inputSchema: z.object({ + path: z.string().describe('Relative path within workspace (use "." 
for root)').default('.'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const entries = await fs.readdir(resolved, { withFileTypes: true }); + return entries + .map((e) => ({ + name: e.name, + type: e.isDirectory() ? 'directory' : 'file', + })) + .slice(0, 100); + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + read_file: tool({ + description: + 'Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.', + inputSchema: z.object({ + path: z.string().describe('Relative path to file within workspace'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const stat = await fs.stat(resolved); + if (stat.isDirectory()) { + return { error: `'${input.path}' is a directory, not a file` }; + } + const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); + const fd = await fs.open(resolved, 'r'); + try { + await fd.read(buffer, 0, buffer.length, 0); + } finally { + await fd.close(); + } + const content = buffer.toString('utf-8'); + const truncated = stat.size > MAX_FILE_SIZE; + return { content, truncated, size: stat.size }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + search_files: tool({ + description: + 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', + inputSchema: z.object({ + pattern: z.string().describe('Regex pattern to search for'), + path: z.string().describe('Relative path to search within (use "." 
for root)').default('.'), + }), + execute: async (input: { pattern: string; path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + let regex: RegExp; + try { + regex = new RegExp(input.pattern, 'gi'); + } catch (regexErr) { + return { + error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`, + }; + } + const matches: Array<{ file: string; line: number; text: string }> = []; + + await searchDirectory(resolved, workspacePath, regex, matches); + + return { matches, total: matches.length }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + }; +} + +/** + * Recursively search a directory for regex matches. + */ +async function searchDirectory( + dirPath: string, + workspacePath: string, + regex: RegExp, + matches: Array<{ file: string; line: number; text: string }>, +): Promise { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + let entries: import('node:fs').Dirent[]; + try { + entries = await fs.readdir(dirPath, { withFileTypes: true }); + } catch { + return; + } + + for (const entry of entries) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + if (SEARCH_SKIP_DIRS.has(entry.name)) continue; + + const fullPath = path.join(dirPath, entry.name); + + if (entry.isDirectory()) { + await searchDirectory(fullPath, workspacePath, regex, matches); + } else if (entry.isFile()) { + const ext = path.extname(entry.name).toLowerCase(); + if (BINARY_EXTENSIONS.has(ext)) continue; + + try { + const stat = await fs.stat(fullPath); + if (stat.size > MAX_FILE_SIZE) continue; + + const content = await fs.readFile(fullPath, 'utf-8'); + const lines = content.split('\n'); + + for (let i = 0; i < lines.length; i++) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + regex.lastIndex = 0; + if (regex.test(lines[i])) { + matches.push({ + file: path.relative(workspacePath, fullPath), + line: i + 1, + text: lines[i].substring(0, 200), 
+ }); + } + } + } catch { + // Skip unreadable files + } + } + } +} diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index 4d9560157..9c79366a0 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -143,11 +143,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { return `Output ends with '${entry.value}'`; case 'llm-judge': - case 'llm_judge': - return typeof entry.prompt === 'string' ? entry.prompt : null; - - case 'agent-judge': - case 'agent_judge': { + case 'llm_judge': { // Expand each rubric item to its own assertion string // Return the first one — callers handle arrays via assertionToNaturalLanguageList if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { @@ -217,10 +213,10 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { /** * Expand a single assertion entry into zero or more NL strings. - * Most assertions produce exactly one string; agent-judge with rubrics expands to many. + * Most assertions produce exactly one string; llm-judge with rubrics expands to many. */ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { - if (entry.type === 'agent-judge' || entry.type === 'agent_judge') { + if (entry.type === 'llm-judge' || entry.type === 'llm_judge') { if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) .map((r) => r.outcome ?? r.criteria ?? 
r.id) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2eb72cb92..4ec619e22 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -852,82 +852,6 @@ async function parseEvaluatorList( continue; } - if (typeValue === 'agent-judge') { - // Validate max_steps (1-50) - const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; - let maxSteps: number | undefined; - if (rawMaxSteps !== undefined) { - if ( - typeof rawMaxSteps !== 'number' || - !Number.isInteger(rawMaxSteps) || - rawMaxSteps < 1 || - rawMaxSteps > 50 - ) { - logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`, - ); - continue; - } - maxSteps = rawMaxSteps; - } - - // Validate temperature (0-2) - const rawTemperature = rawEvaluator.temperature; - let temperature: number | undefined; - if (rawTemperature !== undefined) { - if (typeof rawTemperature !== 'number' || rawTemperature < 0 || rawTemperature > 2) { - logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`, - ); - continue; - } - temperature = rawTemperature; - } - - // Validate target (string) - const judgeTarget = asString(rawEvaluator.target); - - // Parse prompt (file path or inline text) - let agentPrompt: string | undefined; - let agentPromptPath: string | undefined; - const rawAgentPrompt = rawEvaluator.prompt; - if (typeof rawAgentPrompt === 'string') { - agentPrompt = rawAgentPrompt; - const resolved = await resolveFileReference(rawAgentPrompt, searchRoots); - if (resolved.resolvedPath) { - agentPromptPath = path.resolve(resolved.resolvedPath); - } - } - - // Parse rubrics via existing infrastructure - const rawAgentRubrics = rawEvaluator.rubrics; - const agentParsedRubrics = Array.isArray(rawAgentRubrics) - ? 
parseRubricItems(rawAgentRubrics, name, evalId) - : undefined; - - const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); - - evaluators.push({ - name, - type: 'agent-judge', - ...(agentPrompt ? { prompt: agentPrompt } : {}), - ...(agentPromptPath - ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } - : {}), - ...(agentParsedRubrics && agentParsedRubrics.length > 0 - ? { rubrics: agentParsedRubrics } - : {}), - ...(maxSteps !== undefined ? { max_steps: maxSteps } : {}), - ...(temperature !== undefined ? { temperature } : {}), - ...(judgeTarget ? { target: judgeTarget } : {}), - ...(weight !== undefined ? { weight } : {}), - ...(required !== undefined ? { required } : {}), - ...(negate !== undefined ? { negate } : {}), - }); - continue; - } - if (typeValue === 'skill-trigger') { const skillName = asString(rawEvaluator.skill); if (!skillName) { @@ -1266,6 +1190,9 @@ async function parseEvaluatorList( 'config', 'required', 'negate', + 'max_steps', + 'maxSteps', + 'temperature', ]); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { @@ -1284,6 +1211,19 @@ async function parseEvaluatorList( const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : undefined); + // Parse optional max_steps and temperature (used in agent mode) + const rawMaxStepsLlm = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; + const llmMaxSteps = + typeof rawMaxStepsLlm === 'number' && + Number.isInteger(rawMaxStepsLlm) && + rawMaxStepsLlm >= 1 && + rawMaxStepsLlm <= 50 + ? rawMaxStepsLlm + : undefined; + const rawTempLlm = rawEvaluator.temperature; + const llmTemperature = + typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 ? rawTempLlm : undefined; + evaluators.push({ name, type: 'llm-judge', @@ -1297,6 +1237,8 @@ async function parseEvaluatorList( ...(required !== undefined ? 
{ required } : {}), ...(negate !== undefined ? { negate } : {}), ...(finalConfig ? { config: finalConfig } : {}), + ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), + ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 396bc15fe..95cbdab7f 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -235,6 +235,10 @@ export interface RunEvaluationOptions { readonly retainOnSuccess?: 'keep' | 'cleanup'; /** Retention policy override for failed cases */ readonly retainOnFailure?: 'keep' | 'cleanup'; + /** CLI override: judge target name (e.g., "agentv" or a target from targets.yaml) */ + readonly judgeTarget?: string; + /** CLI override: model for judge target (e.g., "openai:gpt-5-mini") */ + readonly model?: string; } export async function runEvaluation( @@ -271,6 +275,8 @@ export async function runEvaluation( workspaceClean, retainOnSuccess, retainOnFailure, + judgeTarget: cliJudgeTarget, + model: cliModel, } = options; // Disable cache when trials > 1 (cache makes trials deterministic = pointless) @@ -335,6 +341,25 @@ export async function runEvaluation( const resolveJudgeProvider = async ( targetContext: ResolvedTarget, ): Promise => { + // CLI --judge-target takes highest priority + if (cliJudgeTarget) { + if (cliJudgeTarget === 'agentv') { + if (!cliModel) { + throw new Error('--judge-target "agentv" requires --model (e.g., "openai:gpt-5-mini")'); + } + const { AgentvProvider } = await import('./providers/agentv-provider.js'); + return new AgentvProvider('agentv', { model: cliModel, temperature: 0 }); + } + const overrideTarget = resolveTargetByName(cliJudgeTarget); + if (!overrideTarget) { + throw new Error(`--judge-target "${cliJudgeTarget}" not found in targets`); + } + return getOrCreateProvider(overrideTarget); + } + + // TODO: When --model is provided 
without --judge-target, override the model of + // whichever judge target is resolved. For now, --model only works with --judge-target agentv. + const judgeName = targetContext.judgeTarget ?? targetContext.name; const resolvedJudge = resolveTargetByName(judgeName); if (!resolvedJudge) { @@ -346,7 +371,8 @@ export async function runEvaluation( // Validate judge_target: error if an agent provider would be used as judge. // Agent providers can't return structured JSON for judging — they respond with // tool calls and markdown, causing silent score-0 failures. - if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) { + // CLI --judge-target override also satisfies this requirement. + if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget && !cliJudgeTarget) { throw new Error( `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target — agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-llm).`, ); diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts new file mode 100644 index 000000000..88084c8fa --- /dev/null +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -0,0 +1,89 @@ +import { createAnthropic } from '@ai-sdk/anthropic'; +import { createAzure } from '@ai-sdk/azure'; +import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import { createOpenAI } from '@ai-sdk/openai'; +import type { LanguageModel } from 'ai'; + +import type { AgentVResolvedConfig } from './targets.js'; +import type { Provider, ProviderRequest, ProviderResponse } from './types.js'; + +/** + * Parse a model string like "openai:gpt-5-mini" into provider prefix and model name. + */ +function parseModelString(model: string): { provider: string; modelName: string } { + const colonIndex = model.indexOf(':'); + if (colonIndex === -1) { + throw new Error( + `Invalid model string "${model}". 
Expected format "provider:model" (e.g., "openai:gpt-5-mini")`, + ); + } + return { + provider: model.slice(0, colonIndex), + modelName: model.slice(colonIndex + 1), + }; +} + +/** + * Create a LanguageModel from a model string using the appropriate AI SDK provider. + */ +function createLanguageModel(modelString: string): LanguageModel { + const { provider, modelName } = parseModelString(modelString); + + switch (provider) { + case 'openai': + // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the + // codebase uses LanguageModelV2. The runtime API is compatible. + return createOpenAI()(modelName) as unknown as LanguageModel; + case 'anthropic': + return createAnthropic()(modelName); + case 'azure': + return createAzure()(modelName); + case 'google': + return createGoogleGenerativeAI()(modelName); + default: + throw new Error( + `Unsupported AI SDK provider "${provider}" in model string "${modelString}". Supported providers: openai, anthropic, azure, google`, + ); + } +} + +/** + * AgentV built-in provider for LLM judge evaluation. + * + * Resolves an AI SDK model string (e.g., "openai:gpt-5-mini", "anthropic:claude-sonnet-4-20250514") + * to a Vercel AI SDK LanguageModel by parsing the provider prefix and creating the appropriate + * AI SDK provider directly. This provider is used exclusively for judge evaluation — it does not + * support direct agent invocation. + * + * Usage: `--judge-target agentv --model openai:gpt-5-mini` + */ +export class AgentvProvider implements Provider { + readonly id: string; + readonly kind = 'agentv' as const; + readonly targetName: string; + + private readonly model: LanguageModel; + + constructor(targetName: string, config: AgentVResolvedConfig) { + this.id = `agentv:${targetName}`; + this.targetName = targetName; + this.model = createLanguageModel(config.model); + } + + /** + * Direct invoke is not supported for the agentv provider. + * Use asLanguageModel() with generateText() instead. 
+ */ + async invoke(_request: ProviderRequest): Promise { + throw new Error( + 'AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead.', + ); + } + + /** + * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject. + */ + asLanguageModel(): LanguageModel { + return this.model; + } +} diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 62cd8eef8..6ec6e2dfa 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -1,3 +1,4 @@ +import { AgentvProvider } from './agentv-provider.js'; import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; import { ClaudeCliProvider } from './claude-cli.js'; import { ClaudeSdkProvider } from './claude-sdk.js'; @@ -30,6 +31,7 @@ export type { } from './types.js'; export type { + AgentVResolvedConfig, AnthropicResolvedConfig, AzureResolvedConfig, ClaudeResolvedConfig, @@ -95,6 +97,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry { // claude-sdk is the explicit SDK provider (requires @anthropic-ai/claude-agent-sdk) .register('claude-sdk', (t) => new ClaudeSdkProvider(t.name, t.config as never)) .register('mock', (t) => new MockProvider(t.name, t.config as never)) + .register('agentv', (t) => new AgentvProvider(t.name, t.config as never)) .register('vscode', (t) => new VSCodeProvider(t.name, t.config as never, 'vscode')) .register( 'vscode-insiders', diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index aa30b06b6..26f827eae 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -514,6 +514,11 @@ export interface VSCodeResolvedConfig { readonly timeoutMs?: number; } +export interface AgentVResolvedConfig { + readonly model: string; + readonly temperature: number; +} + /** 
* Healthcheck configuration type derived from CliHealthcheckSchema. * Supports both HTTP and command-based healthchecks. @@ -628,6 +633,14 @@ export type ResolvedTarget = readonly providerBatching?: boolean; readonly config: VSCodeResolvedConfig; } + | { + readonly kind: 'agentv'; + readonly name: string; + readonly judgeTarget?: string; + readonly workers?: number; + readonly providerBatching?: boolean; + readonly config: AgentVResolvedConfig; + } | { readonly kind: 'cli'; readonly name: string; @@ -841,6 +854,23 @@ export function resolveTargetDefinition( providerBatching, config: resolveVSCodeConfig(parsed, env, provider === 'vscode-insiders', evalFilePath), }; + case 'agentv': { + const model = typeof parsed.model === 'string' ? parsed.model : undefined; + if (!model) { + throw new Error( + `Target "${parsed.name}" (provider: agentv) requires a "model" field (e.g., "openai:gpt-5-mini")`, + ); + } + const temperature = typeof parsed.temperature === 'number' ? parsed.temperature : 0; + return { + kind: 'agentv', + name: parsed.name, + judgeTarget: parsed.judge_target, + workers: typeof parsed.workers === 'number' ? parsed.workers : undefined, + providerBatching, + config: { model, temperature }, + }; + } case 'cli': return { kind: 'cli', diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index af5e3b6a1..e0106071a 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -25,7 +25,8 @@ export type ProviderKind = | 'cli' | 'mock' | 'vscode' - | 'vscode-insiders'; + | 'vscode-insiders' + | 'agentv'; /** * Agent providers that have filesystem access and don't need unwrapped guidelines. 
@@ -63,6 +64,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'mock', 'vscode', 'vscode-insiders', + 'agentv', ] as const; /** diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index dee6b0237..7d8e6ff88 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -6,9 +6,7 @@ * the EvaluatorRegistry at startup. */ -import { readFileSync } from 'node:fs'; import { - AgentJudgeEvaluator, CodeEvaluator, CompositeEvaluator, CostEvaluator, @@ -34,10 +32,10 @@ import { } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; +import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; import type { - AgentJudgeEvaluatorConfig, CodeEvaluatorConfig, CompositeEvaluatorConfig, ContainsAllEvaluatorConfig, @@ -74,6 +72,11 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); * Factory for `llm-judge` evaluators. * Creates a wrapper that resolves custom prompts at evaluation time and * optionally overrides the judge target per evaluator. 
+ * + * Auto-detects mode based on the resolved judge provider: + * - LLM providers (azure, anthropic, gemini): structured JSON mode + * - Agent providers (claude-cli, copilot, etc.): delegate mode + * - agentv provider: built-in AI SDK agent mode with filesystem tools */ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { const c = config as LlmJudgeEvaluatorConfig; @@ -88,12 +91,20 @@ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { if (!judgeTargetProvider) { throw new Error(`llm-judge evaluator '${c.name}': target '${c.target}' not found in targets`); } + // Only pass judgeTargetProvider for agent providers (delegate mode). + // LLM providers use the normal resolveJudgeProvider path for structured JSON mode. + // Note: agentv uses asLanguageModel() not invoke(), so it's not in AGENT_PROVIDER_KINDS; + // check it explicitly here for built-in agent mode. + const isAgent = isAgentProvider(judgeTargetProvider) || judgeTargetProvider.kind === 'agentv'; evaluator = new LlmJudgeEvaluator({ resolveJudgeProvider: async (evalContext) => { if (judgeTargetProvider) return judgeTargetProvider; if (evalContext.judgeProvider) return evalContext.judgeProvider; return judgeProvider; }, + maxSteps: c.max_steps, + temperature: c.temperature, + ...(isAgent ? { judgeTargetProvider } : {}), }); } @@ -198,45 +209,6 @@ export const executionMetricsFactory: EvaluatorFactoryFn = (config) => { }); }; -/** Factory for `agent-judge` evaluators. */ -export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => { - const c = config as AgentJudgeEvaluatorConfig; - const { judgeProvider, targetResolver } = context; - - let customPrompt: string | undefined; - if (c.resolvedPromptPath) { - try { - customPrompt = readFileSync(c.resolvedPromptPath, 'utf-8'); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`); - } - } else if (c.prompt) { - customPrompt = c.prompt; - } - - let judgeTargetProvider: Provider | undefined; - if (c.target && targetResolver) { - judgeTargetProvider = targetResolver(c.target); - if (!judgeTargetProvider) { - throw new Error( - `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`, - ); - } - } - - return new AgentJudgeEvaluator({ - resolveJudgeProvider: async (ctx) => { - if (ctx.judgeProvider) return ctx.judgeProvider; - return judgeProvider; - }, - maxSteps: c.max_steps, - temperature: c.temperature, - evaluatorTemplate: customPrompt, - judgeTargetProvider, - }); -}; - /** Factory for `skill-trigger` evaluator. */ export const skillTriggerFactory: EvaluatorFactoryFn = (config) => { return new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig); @@ -440,7 +412,6 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('cost', costFactory) .register('token-usage', tokenUsageFactory) .register('execution-metrics', executionMetricsFactory) - .register('agent-judge', agentJudgeFactory) .register('skill-trigger', skillTriggerFactory) .register('contains', containsFactory) .register('contains-any', containsAnyFactory) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index b69c272ab..b174af42f 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -158,7 +158,6 @@ const EVALUATOR_KIND_VALUES = [ 'cost', 'token-usage', 'execution-metrics', - 'agent-judge', 'skill-trigger', 'contains', 'contains-any', @@ -337,6 +336,10 @@ export type LlmJudgeEvaluatorConfig = { readonly target?: string; /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */ readonly config?: Record; + /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. 
*/ + readonly max_steps?: number; + /** Temperature override for judge calls */ + readonly temperature?: number; }; /** @@ -529,35 +532,6 @@ export type ExecutionMetricsEvaluatorConfig = { readonly negate?: boolean; }; -/** - * Configuration for the agent-judge evaluator. - * Runs an agentic investigation loop to audit workspaces and verify criteria. - * Two modes: - * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools - * - Judge target: Delegates to an external agent provider via Provider.invoke() - */ -export type AgentJudgeEvaluatorConfig = { - readonly name: string; - readonly type: 'agent-judge'; - /** Custom evaluation prompt (inline text or file path) */ - readonly prompt?: string; - readonly promptPath?: string; - /** Resolved absolute path for prompt file */ - readonly resolvedPromptPath?: string; - /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */ - readonly rubrics?: readonly RubricItem[]; - /** Maximum agent steps for built-in mode (default 10, max 50) */ - readonly max_steps?: number; - /** Temperature for built-in mode (default 0) */ - readonly temperature?: number; - /** Target name — delegates agent loop to this provider instead of built-in mode */ - readonly target?: string; - readonly weight?: number; - readonly required?: boolean | number; - /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ - readonly negate?: boolean; -}; - /** * Configuration for the contains assertion evaluator. * Checks whether the candidate output contains a specified substring. 
@@ -766,7 +740,6 @@ export type EvaluatorConfig = | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig - | AgentJudgeEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 690373b43..e3bad5fed 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -87,6 +87,8 @@ const LlmJudgeSchema = EvaluatorCommonSchema.extend({ model: z.string().optional(), target: z.string().optional(), config: z.record(z.unknown()).optional(), + max_steps: z.number().int().min(1).max(50).optional(), + temperature: z.number().min(0).max(2).optional(), }); /** Aggregator configs for composite evaluator */ @@ -189,15 +191,6 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.enum(['agent-judge', 'agent_judge']), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); - const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), value: z.string(), @@ -233,7 +226,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts index de224a1a2..0647ce387 100644 --- a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -244,7 +244,7 @@ 
describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('The answer is clear and concise'); }); - it('converts agent-judge with rubrics to multiple assertions', () => { + it('converts llm-judge with rubrics to multiple assertions (rubrics variant)', () => { const suite = { tests: [ { @@ -253,7 +253,7 @@ describe('transpileEvalYaml — NL assertions', () => { assertions: [ { type: 'skill-trigger', skill: 's', should_trigger: true }, { - type: 'agent-judge', + type: 'llm-judge', rubrics: [ { id: 'r1', outcome: 'Correct result returned' }, { id: 'r2', outcome: 'No unnecessary steps' }, @@ -269,6 +269,31 @@ describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('No unnecessary steps'); }); + it('converts llm-judge with rubrics to multiple assertions', () => { + const suite = { + tests: [ + { + id: 't1', + input: 'test', + assertions: [ + { type: 'skill-trigger', skill: 's', should_trigger: true }, + { + type: 'llm-judge', + rubrics: [ + { id: 'r1', outcome: 'Response is accurate' }, + { id: 'r2', outcome: 'Formatting is correct' }, + ], + }, + ], + }, + ], + }; + const { files } = transpileEvalYaml(suite); + const evals = files.get('s')?.evals; + expect(evals[0].assertions).toContain('Response is accurate'); + expect(evals[0].assertions).toContain('Formatting is correct'); + }); + it('converts tool-trajectory to NL', () => { const suite = { tests: [ diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts new file mode 100644 index 000000000..2b0c0aadd --- /dev/null +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -0,0 +1,133 @@ +import { describe, expect, it, vi } from 'vitest'; + +// Mock AI SDK provider packages before importing the provider. +// Each createXxx() returns a callable factory: createXxx()(modelName) => model stub. 
+vi.mock('@ai-sdk/openai', () => ({ + createOpenAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'openai', + }), +})); + +vi.mock('@ai-sdk/anthropic', () => ({ + createAnthropic: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'anthropic', + }), +})); + +vi.mock('@ai-sdk/azure', () => ({ + createAzure: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'azure', + }), +})); + +vi.mock('@ai-sdk/google', () => ({ + createGoogleGenerativeAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'google', + }), +})); + +import { AgentvProvider } from '../../../src/evaluation/providers/agentv-provider.js'; + +describe('AgentvProvider', () => { + it('has kind "agentv"', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.kind).toBe('agentv'); + }); + + it('has correct targetName', () => { + const provider = new AgentvProvider('my-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.targetName).toBe('my-judge'); + }); + + it('has correct id format', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.id).toBe('agentv:test-judge'); + }); + + it('asLanguageModel() returns a defined LanguageModel', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-5-mini'); + }); + + it('asLanguageModel() works with anthropic model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'anthropic:claude-sonnet-4-20250514', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as 
unknown as { modelId: string }).modelId).toBe('claude-sonnet-4-20250514'); + }); + + it('asLanguageModel() works with google model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'google:gemini-2.5-flash', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gemini-2.5-flash'); + }); + + it('asLanguageModel() works with azure model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'azure:gpt-4o-deployment', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-4o-deployment'); + }); + + it('throws for unsupported provider prefix', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'unsupported:some-model', + temperature: 0, + }), + ).toThrow('Unsupported AI SDK provider "unsupported"'); + }); + + it('throws for model string without colon separator', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'gpt-5-mini', + temperature: 0, + }), + ).toThrow('Invalid model string "gpt-5-mini"'); + }); + + it('invoke() throws an error', async () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + await expect(provider.invoke({ question: 'test' })).rejects.toThrow( + 'AgentvProvider does not support direct invoke()', + ); + }); +}); diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index eacd573b2..7c7d2b0c2 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -559,6 +559,57 @@ describe('resolveTargetDefinition', () => { ), ).toThrow(/workspace_template has been removed/i); }); + + it('resolves agentv target with model and default 
temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-judge', + provider: 'agentv', + model: 'openai:gpt-5-mini', + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('openai:gpt-5-mini'); + expect(target.config.temperature).toBe(0); + }); + + it('resolves agentv target with explicit temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-warm', + provider: 'agentv', + model: 'anthropic:claude-haiku-4.5', + temperature: 0.7, + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('anthropic:claude-haiku-4.5'); + expect(target.config.temperature).toBe(0.7); + }); + + it('throws when agentv target is missing model', () => { + expect(() => + resolveTargetDefinition( + { + name: 'agentv-no-model', + provider: 'agentv', + }, + {}, + ), + ).toThrow(/model/i); + }); }); describe('createProvider', () => { diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index dd28ea304..bb77b4710 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -47,7 +47,6 @@ export type AssertionType = | 'cost' | 'token-usage' | 'execution-metrics' - | 'agent-judge' | 'skill-trigger' | 'contains' | 'contains-any' @@ -67,7 +66,6 @@ export type AssertionType = | 'field_accuracy' | 'token_usage' | 'execution_metrics' - | 'agent_judge' | 'contains_any' | 'contains_all' | 'icontains_any' diff --git a/plugins/agentv-dev/agents/eval-analyzer.md b/plugins/agentv-dev/agents/eval-analyzer.md index 547c86267..31660128e 100644 --- a/plugins/agentv-dev/agents/eval-analyzer.md +++ b/plugins/agentv-dev/agents/eval-analyzer.md @@ -28,7 +28,7 @@ If `eval-path` is provided, also read the EVAL.yaml to understand evaluator conf ### Step 2: Deterministic-Upgrade 
Analysis -For each evaluator entry in `scores` where `type` is `"llm-judge"`, `"rubrics"`, or `"agent-judge"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: +For each evaluator entry in `scores` where `type` is `"llm-judge"` or `"rubrics"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: | Signal | Detection | Suggested Upgrade | |--------|-----------|-------------------| @@ -123,7 +123,7 @@ If a section has no findings, include the header with "None found." underneath. - **Be specific:** Every suggestion must include the test case ID, evaluator name, evidence from the results, and a concrete replacement config. - **Be conservative:** Only suggest deterministic upgrades when the pattern is clear and consistent. Partial or ambiguous evidence should be noted but not acted on. - **Prioritize by impact:** Order suggestions by estimated cost savings (LLM-judge → deterministic saves the most). -- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `agent-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. +- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. - **Multi-provider awareness:** When results span multiple targets, note if a suggestion applies to all targets or is target-specific. - **No false positives:** It is better to miss a suggestion than to recommend an incorrect upgrade. If unsure, add the finding to a "Needs Review" subsection with your reasoning. 
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 95d7bf796..5ae6275a3 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -222,7 +222,7 @@ tests: |----------|-------------|----------| | `criteria` + **no `assertions`** | Implicit `llm-judge` runs automatically against `criteria` | No | | `criteria` + **`assertions` with only deterministic evaluators** (contains, regex, etc.) | Only declared evaluators run. `criteria` is **not evaluated**. | Yes — warns that no evaluator will consume criteria | -| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, agent-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | +| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | ### No assertions → implicit llm-judge diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 9093c7e48..483031bf6 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -421,6 +421,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -930,112 +940,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - 
"type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -1495,6 +1399,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -2004,112 +1918,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - 
"type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -2569,6 +2377,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -3105,125 +2923,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": 
[ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -3655,6 +3367,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -4164,112 +3886,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": 
["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -4729,6 +4345,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -5265,125 +4891,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - 
"properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -5803,6 +5323,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -6312,112 +5842,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] 
- }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -7292,6 +6716,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -7828,125 +7262,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 
- }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -8366,6 +7694,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -8875,112 +8213,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": 
"boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -9440,6 +8672,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -9976,125 +9218,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": 
"boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -10526,6 +9662,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -11035,112 +10181,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - 
"weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -11600,6 +10640,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -12136,125 +11186,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - 
"outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -12674,6 +11618,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -13183,112 +12137,6 @@ "required": ["type"], 
"additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -14067,6 +12915,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -14603,125 +13461,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - 
"type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -15141,6 +13893,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + 
"maximum": 2 } }, "required": ["type"], @@ -15650,112 +14412,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -16215,6 +14871,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -16696,135 +15362,29 @@ "minimum": 0 }, "max_llm_calls": { - "type": "number", - 
"minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } + "type": "number", + "minimum": 0 }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 + "max_tokens": { + "type": "number", + "minimum": 0 }, - "temperature": { + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { 
"type": "number", "minimum": 0, - "maximum": 2 + "maximum": 1 }, - "target": { - "type": "string" + "exploration_tolerance": { + "type": "number", + "minimum": 0 } }, "required": ["type"], @@ -17335,6 +15895,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -17844,112 +16414,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - 
"additionalProperties": false - }, { "type": "object", "properties": { @@ -18409,6 +16873,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -18918,112 +17392,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": {