Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 23 additions & 0 deletions CLAUDE.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,6 +159,28 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m

5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic.

## Completing Work — E2E Checklist

Before marking any branch as ready for review, complete this checklist:

1. **Copy `.env` to worktree** (if working in a git worktree):
```bash
cp "$(dirname "$(git rev-parse --git-common-dir)")/.env" .env
```
Without this, any eval run or LLM-dependent test will fail with missing API key errors.

2. **Run unit tests**: `bun run test` — all must pass.

3. **Run at least one real eval** against an example file to verify end-to-end behavior:
```bash
bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id <test-id>
```
Inspect the output JSONL to confirm correct evaluator type, scores, and hits/misses.

4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types).

5. **Mark PR as ready** only after all above steps pass.

## Evaluator Type System

Evaluator types use **kebab-case** everywhere (matching promptfoo convention):
Expand Down Expand Up @@ -248,6 +270,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow:
```

4. **Before merging**, ensure:
- **E2E verification completed** (see "Completing Work — E2E Checklist" above)
- CI pipeline passes (all checks green)
- Code has been reviewed if required
- No merge conflicts with `main`
Expand Down
13 changes: 13 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
Original file line number Diff line number Diff line change
Expand Up @@ -163,6 +163,17 @@ export const evalRunCommand = command({
description:
'Write companion artifacts (grading/<test>.json, timing.json, benchmark.json) to the specified directory',
}),
judgeTarget: option({
type: optional(string),
long: 'judge-target',
description:
'Override judge target for all evaluators (e.g., "agentv", or a target name from targets.yaml)',
}),
model: option({
type: optional(string),
long: 'model',
description: 'Override model for the judge target (e.g., "openai:gpt-5-mini")',
}),
},
handler: async (args) => {
// Launch interactive wizard when no eval paths and stdin is a TTY
Expand Down Expand Up @@ -203,6 +214,8 @@ export const evalRunCommand = command({
strict: args.strict,
benchmarkJson: args.benchmarkJson,
artifacts: args.artifacts,
judgeTarget: args.judgeTarget,
model: args.model,
};
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
},
Expand Down
11 changes: 11 additions & 0 deletions apps/cli/src/commands/eval/run-eval.ts
Original file line number Diff line number Diff line change
Expand Up @@ -82,6 +82,8 @@ interface NormalizedOptions {
readonly workspacePath?: string;
readonly benchmarkJson?: string;
readonly artifacts?: string;
readonly judgeTarget?: string;
readonly model?: string;
}

function normalizeBoolean(value: unknown): boolean {
Expand Down Expand Up @@ -249,6 +251,8 @@ function normalizeOptions(
workspacePath,
benchmarkJson: normalizeString(rawOptions.benchmarkJson),
artifacts: normalizeString(rawOptions.artifacts),
judgeTarget: normalizeString(rawOptions.judgeTarget),
model: normalizeString(rawOptions.model),
} satisfies NormalizedOptions;
}

Expand Down Expand Up @@ -593,6 +597,8 @@ async function runSingleEvalFile(params: {
trials: trialsConfig,
totalBudgetUsd,
failOnError,
judgeTarget: options.judgeTarget,
model: options.model,
streamCallbacks: streamingObserver?.getStreamCallbacks(),
onResult: async (result: EvaluationResult) => {
// Finalize streaming observer span with score
Expand Down Expand Up @@ -674,6 +680,11 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>

let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);

// Validate --judge-target / --model combinations
if (options.judgeTarget === 'agentv' && !options.model) {
throw new Error('--judge-target agentv requires --model (e.g., --model openai:gpt-5-mini)');
}

// --retry-errors: override filter to only re-run execution_error test cases.
// IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
// since the retry source and output destination may refer to the same file.
Expand Down
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -265,7 +265,7 @@ tests:

### `assert` present — explicit evaluators only

When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, `agent-judge`, or `rubrics`) receive `criteria` as input automatically.
When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, or `rubrics`) receive `criteria` as input automatically.

If `assert` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted:

Expand Down
6 changes: 3 additions & 3 deletions apps/web/src/content/docs/guides/agent-eval-layers.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based

| Concern | AgentV evaluator |
|---------|-----------------|
| Plan quality & coherence | `llm_judge` with reasoning-focused prompt |
| Workspace-aware auditing | `agent_judge` with rubrics |
| Plan quality & coherence | `llm-judge` with reasoning-focused prompt |
| Workspace-aware auditing | `llm-judge` with rubrics |

```yaml
# Layer 1: Reasoning — verify the agent's plan makes sense
Expand All @@ -29,7 +29,7 @@ assertions:
Did it select appropriate tools for the task?
Score 1.0 if reasoning is sound, 0.0 if not.
- name: workspace-audit
type: agent-judge
type: llm-judge
max_steps: 5
temperature: 0
rubrics:
Expand Down
7 changes: 4 additions & 3 deletions bun.lock

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

12 changes: 3 additions & 9 deletions docs/plans/2026-02-26-eval-schema-generation-design.md
Original file line number Diff line number Diff line change
Expand Up @@ -248,14 +248,9 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({
exploration_tolerance: z.number().min(0).optional(),
});

const AgentJudgeSchema = EvaluatorCommonSchema.extend({
type: z.literal('agent_judge'),
prompt: z.string().optional(),
rubrics: z.array(RubricItemSchema).optional(),
max_steps: z.number().int().min(1).max(50).optional(),
temperature: z.number().min(0).max(2).optional(),
target: z.string().optional(),
});
// Note: agent_judge was removed — llm-judge now covers all judge use cases
// including agentic behavior (auto-detected based on judge provider kind).
// See LlmJudgeSchema above for the unified schema.

const ContainsSchema = EvaluatorCommonSchema.extend({
type: z.literal('contains'),
Expand Down Expand Up @@ -292,7 +287,6 @@ const EvaluatorSchema = z.union([
CostSchema,
TokenUsageSchema,
ExecutionMetricsSchema,
AgentJudgeSchema,
ContainsSchema,
RegexSchema,
IsJsonSchema,
Expand Down
22 changes: 0 additions & 22 deletions examples/features/agent-judge/.agentv/targets.yaml

This file was deleted.

This file was deleted.

64 changes: 0 additions & 64 deletions examples/features/agent-judge/evals/dataset.eval.yaml

This file was deleted.

5 changes: 0 additions & 5 deletions examples/features/agent-judge/workspace-template/package.json

This file was deleted.

11 changes: 0 additions & 11 deletions examples/features/agent-judge/workspace-template/src/main.ts

This file was deleted.

4 changes: 2 additions & 2 deletions examples/features/file-changes-judges/.agentv/targets.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -11,15 +11,15 @@ targets:
workspace_template: ../workspace-template
judge_target: azure_judge

# Azure OpenAI — used as LLM judge (rubrics) and built-in agent_judge provider
# Azure OpenAI — used as LLM judge (rubrics) and built-in llm-judge provider
- name: azure_judge
provider: azure
endpoint: ${{ AZURE_OPENAI_ENDPOINT }}
api_key: ${{ AZURE_OPENAI_API_KEY }}
model: ${{ AZURE_DEPLOYMENT_NAME }}
version: ${{ AZURE_OPENAI_API_VERSION }}

# Copilot CLI — used as delegated agent_judge target
# Copilot CLI — used as delegated llm-judge target
- name: copilot_judge
provider: copilot-cli
model: claude-haiku-4.5
18 changes: 9 additions & 9 deletions examples/features/file-changes-judges/evals/dataset.eval.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -2,13 +2,13 @@
#
# Proves that file_changes diffs are correctly passed to all judge types:
# 1. rubrics — LLM judge (Azure) evaluates the diff
# 2. agent_judge — built-in mode (Azure via AI SDK) sees file_changes in prompt
# 3. agent_judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt
# 2. llm-judge — built-in mode (Azure via AI SDK) sees file_changes in prompt
# 3. llm-judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt
#
# The mock agent adds a `subtract` function to calculator.ts, producing a small
# diff (~10 lines) that fits comfortably in any LLM context window.

description: Verify file_changes diffs are accessible to LLM judge, built-in agent judge, and copilot-cli agent judge
description: Verify file_changes diffs are accessible to LLM judge (rubrics, built-in, and copilot-cli)

execution:
target: mock_agent
Expand Down Expand Up @@ -43,14 +43,14 @@ tests:
outcome: "The file_changes contains a valid unified diff format"
weight: 0.5

# 2. Built-in agent judge — Azure via AI SDK with filesystem tools
- name: agent-judge-builtin
type: agent-judge
# 2. Built-in LLM judge — Azure via AI SDK with filesystem tools
- name: llm-judge-builtin
type: llm-judge
max_steps: 3
temperature: 0

# 3. Copilot CLI agent judge — delegated via target
- name: agent-judge-copilot
type: agent-judge
# 3. Copilot CLI LLM judge — delegated via target
- name: llm-judge-copilot
type: llm-judge
target: copilot_judge
temperature: 0
3 changes: 2 additions & 1 deletion packages/core/package.json
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,12 @@
},
"files": ["dist", "README.md"],
"dependencies": {
"@agentv/eval": "workspace:*",
"@agentclientprotocol/sdk": "^0.14.1",
"@agentv/eval": "workspace:*",
"@ai-sdk/anthropic": "^2.0.53",
"@ai-sdk/azure": "^2.0.78",
"@ai-sdk/google": "^2.0.44",
"@ai-sdk/openai": "^2.0.0",
"@anthropic-ai/claude-agent-sdk": "^0.2.49",
"@github/copilot-sdk": "^0.1.25",
"@mariozechner/pi-agent-core": "^0.54.2",
Expand Down
Loading