Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 7 additions & 7 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ tests:

expected_output: "42"

assert:
assertions:
- name: math_check
type: code-judge
command: ./validators/check_math.py
Expand Down Expand Up @@ -162,7 +162,7 @@ description: Math evaluation dataset
dataset: math-tests
execution:
target: azure-base
assert:
assertions:
- name: correctness
type: llm-judge
prompt: ./judges/correctness.md
Expand Down Expand Up @@ -259,7 +259,7 @@ print(json.dumps({
Reference evaluators in your eval file:

```yaml
assert:
assertions:
- name: my_validator
type: code-judge
command: ./validators/check_answer.py
Expand Down Expand Up @@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => {
Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML:

```yaml
assert:
assertions:
- type: word-count # matches word-count.ts
- type: contains
value: "Hello"
Expand Down Expand Up @@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c
All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed).

```yaml
assert:
assertions:
# Case-insensitive matching for natural language variation
- type: icontains-any
value: ["missing rule code", "need rule code", "provide rule code"]
Expand Down Expand Up @@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea
Create markdown judge files with evaluation criteria and scoring guidelines:

```yaml
assert:
assertions:
- name: semantic_check
type: llm-judge
prompt: ./judges/correctness.md
Expand All @@ -505,7 +505,7 @@ tests:

input: Explain quicksort algorithm

assert:
assertions:
- type: rubrics
criteria:
- Mentions divide-and-conquer approach
Expand Down
14 changes: 7 additions & 7 deletions apps/cli/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -58,7 +58,7 @@ tests:

expected_output: "42"

assert:
assertions:
- name: math_check
type: code-judge
command: ./validators/check_math.py
Expand Down Expand Up @@ -162,7 +162,7 @@ description: Math evaluation dataset
dataset: math-tests
execution:
target: azure-base
assert:
assertions:
- name: correctness
type: llm-judge
prompt: ./judges/correctness.md
Expand Down Expand Up @@ -259,7 +259,7 @@ print(json.dumps({
Reference evaluators in your eval file:

```yaml
assert:
assertions:
- name: my_validator
type: code-judge
command: ./validators/check_answer.py
Expand Down Expand Up @@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => {
Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML:

```yaml
assert:
assertions:
- type: word-count # matches word-count.ts
- type: contains
value: "Hello"
Expand Down Expand Up @@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c
All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed).

```yaml
assert:
assertions:
# Case-insensitive matching for natural language variation
- type: icontains-any
value: ["missing rule code", "need rule code", "provide rule code"]
Expand Down Expand Up @@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea
Create markdown judge files with evaluation criteria and scoring guidelines:

```yaml
assert:
assertions:
- name: semantic_check
type: llm-judge
prompt: ./judges/correctness.md
Expand All @@ -505,7 +505,7 @@ tests:

input: Explain quicksort algorithm

assert:
assertions:
- type: rubrics
criteria:
- Mentions divide-and-conquer approach
Expand Down
2 changes: 1 addition & 1 deletion apps/cli/src/commands/convert/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,7 @@ export function convertEvalsJsonToYaml(inputPath: string): string {
if (test.assertions && test.assertions.length > 0) {
lines.push(' # Promoted from evals.json assertions[]');
lines.push(' # Replace with type: is_json, contains, or regex for deterministic checks');
lines.push(' assert:');
lines.push(' assertions:');
for (const assertion of test.assertions) {
lines.push(` - name: ${assertion.name}`);
lines.push(` type: ${assertion.type}`);
Expand Down
6 changes: 3 additions & 3 deletions apps/cli/src/commands/create/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ tests:
criteria: Agent responds correctly
input: "Hello, how are you?"
expected_output: "I'm doing well"
assert:
assertions:
- type: contains
value: "well"
`,
Expand All @@ -53,7 +53,7 @@ tests:
criteria: Agent responds correctly and completely
input: "Hello, how are you?"
expected_output: "I'm doing well, thank you for asking!"
assert:
assertions:
- type: llm-judge
rubric:
accuracy:
Expand Down Expand Up @@ -126,7 +126,7 @@ export const createAssertionCommand = command({
await mkdir(dir, { recursive: true });
await writeFile(filePath, content);
console.log(`Created ${path.relative(process.cwd(), filePath)} (template: ${templateName})`);
console.log(`\nUse in EVAL.yaml:\n assert:\n - type: ${name}`);
console.log(`\nUse in EVAL.yaml:\n assertions:\n - type: ${name}`);
},
});

Expand Down
2 changes: 1 addition & 1 deletion apps/cli/test/commands/convert/convert-evals-json.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,7 @@ describe('convertEvalsJsonToYaml', () => {
const yaml = convertEvalsJsonToYaml(filePath);
expect(yaml).toContain('id: "1"');
expect(yaml).toContain('Just a prompt');
expect(yaml).not.toContain('assert:');
expect(yaml).not.toContain('assertions:');
expect(yaml).not.toContain('expected_output:');
});

Expand Down
2 changes: 1 addition & 1 deletion apps/cli/test/prompt-eval.integration.test.ts
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ async function createFixture(): Promise<PromptEvalFixture> {
tests:
- id: greeting-test
criteria: Assistant greets the user by name
assert:
assertions:
- name: mentions-name
type: contains
value: Taylor
Expand Down
4 changes: 2 additions & 2 deletions apps/web/src/content/docs/evaluation/batch-cli.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -53,7 +53,7 @@ tests:
name: Example A
amount: 5000

assert:
assertions:
- name: decision-check
type: code-judge
command: [bun, run, ./scripts/check-output.ts]
Expand Down Expand Up @@ -81,7 +81,7 @@ tests:
name: Example B
amount: 25000

assert:
assertions:
- name: decision-check
type: code-judge
command: [bun, run, ./scripts/check-output.ts]
Expand Down
14 changes: 7 additions & 7 deletions apps/web/src/content/docs/evaluation/eval-cases.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -81,7 +81,7 @@ tests:

execution:
target: gpt4_target
assert:
assertions:
- name: depth_check
type: llm-judge
prompt: ./judges/depth.md
Expand All @@ -90,7 +90,7 @@ tests:
Per-case `assertions` evaluators are **merged** with root-level `assertions` evaluators — test-specific evaluators run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`:

```yaml
assert:
assertions:
- name: latency_check
type: latency
threshold: 5000
Expand All @@ -106,7 +106,7 @@ tests:
input: Handle this edge case
execution:
skip_defaults: true
assert:
assertions:
- name: custom_eval
type: llm-judge
# Does NOT get latency_check
Expand Down Expand Up @@ -179,7 +179,7 @@ tests:
- id: json-api
criteria: Returns valid JSON with status field
input: Return the system status as JSON
assert:
assertions:
- type: is-json
- type: contains
value: '"status"'
Expand All @@ -201,7 +201,7 @@ tests:
expected_output:
- role: assistant
content: "DENIED"
assert:
assertions:
- type: contains
value: "DENIED"
required: true
Expand All @@ -225,7 +225,7 @@ Any evaluator in `assertions` can be marked as `required`. When a required evaluator
| `required: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) |

```yaml
assert:
assertions:
- type: contains
value: "DENIED"
required: true # must pass (>= 0.8)
Expand Down Expand Up @@ -282,7 +282,7 @@ tests:
- id: mixed-eval
criteria: Response is helpful and mentions the fix
input: "Debug this function..."
assert:
assertions:
- type: llm-judge # explicit — receives criteria automatically
- type: contains
value: "fix"
Expand Down
6 changes: 3 additions & 3 deletions apps/web/src/content/docs/evaluation/eval-files.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@ description: Math problem solving evaluation
execution:
target: default

assert:
assertions:
- name: correctness
type: llm-judge
prompt: ./judges/correctness.md
Expand Down Expand Up @@ -76,7 +76,7 @@ The `assertions` field is the canonical way to define suite-level evaluators. Suite-

```yaml
description: API response validation
assert:
assertions:
- type: is-json
required: true
- type: contains
Expand Down Expand Up @@ -199,7 +199,7 @@ description: Math evaluation dataset
dataset: math-tests
execution:
target: azure-base
assert:
assertions:
- name: correctness
type: llm-judge
prompt: ./judges/correctness.md
Expand Down
16 changes: 8 additions & 8 deletions apps/web/src/content/docs/evaluation/examples.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -82,7 +82,7 @@ tests:
- id: json-generation-with-validation
criteria: Generates valid JSON with required fields

assert:
assertions:
- name: json_format_validator
type: code-judge
command: [uv, run, validate_json.py]
Expand Down Expand Up @@ -117,7 +117,7 @@ tests:
- id: research-depth
criteria: Agent researches thoroughly
input: Research REST vs GraphQL
assert:
assertions:
- name: research-check
type: tool-trajectory
mode: any_order
Expand All @@ -129,7 +129,7 @@ tests:
- id: auth-flow
criteria: Agent follows auth sequence
input: Authenticate user
assert:
assertions:
- name: auth-sequence
type: tool-trajectory
mode: exact
Expand All @@ -150,13 +150,13 @@ execution:
tests:
- file://../fixtures/labeled-judge-export.jsonl

assert:
assertions:
- name: judge-panel
type: composite
aggregator:
type: threshold
threshold: 0.6
assert:
assertions:
- name: judge-gpt-5-mini
type: llm-judge
target: judge_gpt_5_mini
Expand Down Expand Up @@ -186,7 +186,7 @@ tests:
- id: validate-trace-file
criteria: Trace contains required steps
input: Analyze trace
assert:
assertions:
- name: trace-check
type: tool-trajectory
mode: in_order
Expand Down Expand Up @@ -293,7 +293,7 @@ tests:
amount: 5000
currency: USD

assert:
assertions:
- name: decision-check
type: code-judge
command: [bun, run, ./scripts/check-batch-cli-output.ts]
Expand Down Expand Up @@ -326,7 +326,7 @@ tests:
amount: 2000
currency: USD

assert:
assertions:
- name: decision-check
type: code-judge
command: [bun, run, ./scripts/check-batch-cli-output.ts]
Expand Down
Loading