Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,28 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m

5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic.

## Completing Work — E2E Checklist

Before marking any branch as ready for review, complete this checklist:

1. **Copy `.env` to worktree** (if working in a git worktree):
```bash
cp "$(dirname "$(git rev-parse --git-common-dir)")/.env" .env
```
Without this, any eval run or LLM-dependent test will fail with missing API key errors.

2. **Run unit tests**: `bun run test` — all must pass.

3. **Run at least one real eval** against an example file to verify end-to-end behavior:
```bash
bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id <test-id>
```
Inspect the output JSONL to confirm correct evaluator type, scores, and hits/misses.

4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).

5. **Mark PR as ready** only after all above steps pass.

## Evaluator Type System

Evaluator types use **kebab-case** everywhere (matching promptfoo convention):
Expand Down Expand Up @@ -248,6 +270,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
```

4. **Before merging**, ensure:
- **E2E verification completed** (see "Completing Work — E2E Checklist" above)
- CI pipeline passes (all checks green)
- Code has been reviewed if required
- No merge conflicts with `main`
Expand Down
13 changes: 13 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ export const evalRunCommand = command({
description:
'Write companion artifacts (grading/<test>.json, timing.json, benchmark.json) to the specified directory',
}),
judgeTarget: option({
type: optional(string),
long: 'judge-target',
description:
'Override judge target for all evaluators (e.g., "agentv", or a target name from targets.yaml)',
}),
model: option({
type: optional(string),
long: 'model',
description: 'Override model for the judge target (e.g., "openai:gpt-5-mini")',
}),
},
handler: async (args) => {
// Launch interactive wizard when no eval paths and stdin is a TTY
Expand Down Expand Up @@ -203,6 +214,8 @@ export const evalRunCommand = command({
strict: args.strict,
benchmarkJson: args.benchmarkJson,
artifacts: args.artifacts,
judgeTarget: args.judgeTarget,
model: args.model,
};
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
},
Expand Down
11 changes: 11 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ interface NormalizedOptions {
readonly workspacePath?: string;
readonly benchmarkJson?: string;
readonly artifacts?: string;
readonly judgeTarget?: string;
readonly model?: string;
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -249,6 +251,8 @@ function normalizeOptions(
workspacePath,
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
artifacts: normalizeString(rawOptions.artifacts),
judgeTarget: normalizeString(rawOptions.judgeTarget),
model: normalizeString(rawOptions.model),
} satisfies NormalizedOptions;
}

Expand Down Expand Up @@ -593,6 +597,8 @@ async function runSingleEvalFile(params: {
trials: trialsConfig,
totalBudgetUsd,
failOnError,
judgeTarget: options.judgeTarget,
model: options.model,
streamCallbacks: streamingObserver?.getStreamCallbacks(),
onResult: async (result: EvaluationResult) => {
// Finalize streaming observer span with score
Expand Down Expand Up @@ -674,6 +680,11 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>

let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);

// Validate --judge-target / --model combinations
if (options.judgeTarget === 'agentv' && !options.model) {
throw new Error('--judge-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
}

// --retry-errors: override filter to only re-run execution_error test cases.
// IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
// since the retry source and output destination may refer to the same file.
Expand Down
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ tests:

### `assert` present — explicit evaluators only

When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, `agent-judge`, or `rubrics`) receive `criteria` as input automatically.
When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, or `rubrics`) receive `criteria` as input automatically.

If `assert` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted:

Expand Down
6 changes: 3 additions & 3 deletions apps/web/src/content/docs/guides/agent-eval-layers.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based

| Concern | AgentV evaluator |
|---------|-----------------|
| Plan quality & coherence | `llm_judge` with reasoning-focused prompt |
| Workspace-aware auditing | `agent_judge` with rubrics |
| Plan quality & coherence | `llm-judge` with reasoning-focused prompt |
| Workspace-aware auditing | `llm-judge` with rubrics |

```yaml
# Layer 1: Reasoning — verify the agent's plan makes sense
Expand All @@ -29,7 +29,7 @@ assertions:
Did it select appropriate tools for the task?
Score 1.0 if reasoning is sound, 0.0 if not.
- name: workspace-audit
type: agent-judge
type: llm-judge
max_steps: 5
temperature: 0
rubrics:
Expand Down
7 changes: 4 additions & 3 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 3 additions & 9 deletions docs/plans/2026-02-26-eval-schema-generation-design.md
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,9 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({
exploration_tolerance: z.number().min(0).optional(),
});

const AgentJudgeSchema = EvaluatorCommonSchema.extend({
type: z.literal('agent_judge'),
prompt: z.string().optional(),
rubrics: z.array(RubricItemSchema).optional(),
max_steps: z.number().int().min(1).max(50).optional(),
temperature: z.number().min(0).max(2).optional(),
target: z.string().optional(),
});
// Note: agent_judge was removed — llm-judge now covers all judge use cases
// including agentic behavior (auto-detected based on judge provider kind).
// See LlmJudgeSchema above for the unified schema.

const ContainsSchema = EvaluatorCommonSchema.extend({
type: z.literal('contains'),
Expand Down Expand Up @@ -292,7 +287,6 @@ const EvaluatorSchema = z.union([
CostSchema,
TokenUsageSchema,
ExecutionMetricsSchema,
AgentJudgeSchema,
ContainsSchema,
RegexSchema,
IsJsonSchema,
Expand Down
22 changes: 0 additions & 22 deletions examples/features/agent-judge/.agentv/targets.yaml

This file was deleted.

This file was deleted.

64 changes: 0 additions & 64 deletions examples/features/agent-judge/evals/dataset.eval.yaml

This file was deleted.

5 changes: 0 additions & 5 deletions examples/features/agent-judge/workspace-template/package.json

This file was deleted.

11 changes: 0 additions & 11 deletions examples/features/agent-judge/workspace-template/src/main.ts

This file was deleted.

4 changes: 2 additions & 2 deletions examples/features/file-changes-judges/.agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ targets:
workspace_template: ../workspace-template
judge_target: azure_judge

# Azure OpenAI — used as LLM judge (rubrics) and built-in agent_judge provider
# Azure OpenAI — used as LLM judge (rubrics) and built-in llm-judge provider
- name: azure_judge
provider: azure
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
version: ${{ AZURE_OPENAI_API_VERSION }}

# Copilot CLI — used as delegated agent_judge target
# Copilot CLI — used as delegated llm-judge target
- name: copilot_judge
provider: copilot-cli
model: claude-haiku-4.5
18 changes: 9 additions & 9 deletions examples/features/file-changes-judges/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
#
# Proves that file_changes diffs are correctly passed to all judge types:
# 1. rubrics — LLM judge (Azure) evaluates the diff
# 2. agent_judge — built-in mode (Azure via AI SDK) sees file_changes in prompt
# 3. agent_judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt
# 2. llm-judge — built-in mode (Azure via AI SDK) sees file_changes in prompt
# 3. llm-judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt
#
# The mock agent adds a `subtract` function to calculator.ts, producing a small
# diff (~10 lines) that fits comfortably in any LLM context window.

description: Verify file_changes diffs are accessible to LLM judge, built-in agent judge, and copilot-cli agent judge
description: Verify file_changes diffs are accessible to LLM judge (rubrics, built-in, and copilot-cli)

execution:
target: mock_agent
Expand Down Expand Up @@ -43,14 +43,14 @@ tests:
outcome: "The file_changes contains a valid unified diff format"
weight: 0.5

# 2. Built-in agent judge — Azure via AI SDK with filesystem tools
- name: agent-judge-builtin
type: agent-judge
# 2. Built-in LLM judge — Azure via AI SDK with filesystem tools
- name: llm-judge-builtin
type: llm-judge
max_steps: 3
temperature: 0

# 3. Copilot CLI agent judge — delegated via target
- name: agent-judge-copilot
type: agent-judge
# 3. Copilot CLI LLM judge — delegated via target
- name: llm-judge-copilot
type: llm-judge
target: copilot_judge
temperature: 0
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@
},
"files": ["dist", "README.md"],
"dependencies": {
"@agentv/eval": "workspace:*",
"@agentclientprotocol/sdk": "^0.14.1",
"@agentv/eval": "workspace:*",
"@ai-sdk/anthropic": "^2.0.53",
"@ai-sdk/azure": "^2.0.78",
"@ai-sdk/google": "^2.0.44",
"@ai-sdk/openai": "^2.0.0",
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
"@github/copilot-sdk": "^0.1.25",
"@mariozechner/pi-agent-core": "^0.54.2",
Expand Down
Loading