EntityProcess · christso · Mar 15, 2026 · Mar 15, 2026 · Mar 15, 2026
diff --git a/README.md b/README.md
@@ -58,7 +58,7 @@ tests:
 
     expected_output: "42"
 
-    assert:
+    assertions:
       - name: math_check
         type: code-judge
         command: ./validators/check_math.py
@@ -162,7 +162,7 @@ description: Math evaluation dataset
 dataset: math-tests
 execution:
   target: azure-base
-assert:
+assertions:
   - name: correctness
     type: llm-judge
     prompt: ./judges/correctness.md
@@ -259,7 +259,7 @@ print(json.dumps({
 Reference evaluators in your eval file:
 
 ```yaml
-assert:
+assertions:
   - name: my_validator
     type: code-judge
     command: ./validators/check_answer.py
@@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => {
 Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML:
 
 ```yaml
-assert:
+assertions:
   - type: word-count    # matches word-count.ts
   - type: contains
     value: "Hello"
@@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c
 All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed).
 
 ```yaml
-assert:
+assertions:
   # Case-insensitive matching for natural language variation
   - type: icontains-any
     value: ["missing rule code", "need rule code", "provide rule code"]
@@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea
 Create markdown judge files with evaluation criteria and scoring guidelines:
 
 ```yaml
-assert:
+assertions:
   - name: semantic_check
     type: llm-judge
     prompt: ./judges/correctness.md
@@ -505,7 +505,7 @@ tests:
 
     input: Explain quicksort algorithm
 
-    assert:
+    assertions:
       - type: rubrics
         criteria:
           - Mentions divide-and-conquer approach

diff --git a/apps/cli/README.md b/apps/cli/README.md
@@ -58,7 +58,7 @@ tests:
 
     expected_output: "42"
 
-    assert:
+    assertions:
       - name: math_check
         type: code-judge
         command: ./validators/check_math.py
@@ -162,7 +162,7 @@ description: Math evaluation dataset
 dataset: math-tests
 execution:
   target: azure-base
-assert:
+assertions:
   - name: correctness
     type: llm-judge
     prompt: ./judges/correctness.md
@@ -259,7 +259,7 @@ print(json.dumps({
 Reference evaluators in your eval file:
 
 ```yaml
-assert:
+assertions:
   - name: my_validator
     type: code-judge
     command: ./validators/check_answer.py
@@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => {
 Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML:
 
 ```yaml
-assert:
+assertions:
   - type: word-count    # matches word-count.ts
   - type: contains
     value: "Hello"
@@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c
 All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed).
 
 ```yaml
-assert:
+assertions:
   # Case-insensitive matching for natural language variation
   - type: icontains-any
     value: ["missing rule code", "need rule code", "provide rule code"]
@@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea
 Create markdown judge files with evaluation criteria and scoring guidelines:
 
 ```yaml
-assert:
+assertions:
   - name: semantic_check
     type: llm-judge
     prompt: ./judges/correctness.md
@@ -505,7 +505,7 @@ tests:
 
     input: Explain quicksort algorithm
 
-    assert:
+    assertions:
       - type: rubrics
         criteria:
           - Mentions divide-and-conquer approach

diff --git a/apps/cli/src/commands/convert/index.ts b/apps/cli/src/commands/convert/index.ts
@@ -134,7 +134,7 @@ export function convertEvalsJsonToYaml(inputPath: string): string {
     if (test.assertions && test.assertions.length > 0) {
       lines.push('    # Promoted from evals.json assertions[]');
       lines.push('    # Replace with type: is_json, contains, or regex for deterministic checks');
-      lines.push('    assert:');
+      lines.push('    assertions:');
       for (const assertion of test.assertions) {
         lines.push(`      - name: ${assertion.name}`);
         lines.push(`        type: ${assertion.type}`);

diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts
@@ -40,7 +40,7 @@ tests:
     criteria: Agent responds correctly
     input: "Hello, how are you?"
     expected_output: "I'm doing well"
-    assert:
+    assertions:
       - type: contains
         value: "well"
 `,
@@ -53,7 +53,7 @@ tests:
     criteria: Agent responds correctly and completely
     input: "Hello, how are you?"
     expected_output: "I'm doing well, thank you for asking!"
-    assert:
+    assertions:
       - type: llm-judge
         rubric:
           accuracy:
@@ -126,7 +126,7 @@ export const createAssertionCommand = command({
     await mkdir(dir, { recursive: true });
     await writeFile(filePath, content);
     console.log(`Created ${path.relative(process.cwd(), filePath)} (template: ${templateName})`);
-    console.log(`\nUse in EVAL.yaml:\n  assert:\n    - type: ${name}`);
+    console.log(`\nUse in EVAL.yaml:\n  assertions:\n    - type: ${name}`);
   },
 });
 

diff --git a/apps/cli/test/commands/convert/convert-evals-json.test.ts b/apps/cli/test/commands/convert/convert-evals-json.test.ts
@@ -46,7 +46,7 @@ describe('convertEvalsJsonToYaml', () => {
     const yaml = convertEvalsJsonToYaml(filePath);
     expect(yaml).toContain('id: "1"');
     expect(yaml).toContain('Just a prompt');
-    expect(yaml).not.toContain('assert:');
+    expect(yaml).not.toContain('assertions:');
     expect(yaml).not.toContain('expected_output:');
   });
 

diff --git a/apps/cli/test/prompt-eval.integration.test.ts b/apps/cli/test/prompt-eval.integration.test.ts
@@ -33,7 +33,7 @@ async function createFixture(): Promise<PromptEvalFixture> {
 tests:
   - id: greeting-test
     criteria: Assistant greets the user by name
-    assert:
+    assertions:
       - name: mentions-name
         type: contains
         value: Taylor

diff --git a/apps/web/src/content/docs/evaluation/batch-cli.mdx b/apps/web/src/content/docs/evaluation/batch-cli.mdx
@@ -53,7 +53,7 @@ tests:
             name: Example A
             amount: 5000
 
-    assert:
+    assertions:
       - name: decision-check
         type: code-judge
         command: [bun, run, ./scripts/check-output.ts]
@@ -81,7 +81,7 @@ tests:
             name: Example B
             amount: 25000
 
-    assert:
+    assertions:
       - name: decision-check
         type: code-judge
         command: [bun, run, ./scripts/check-output.ts]

diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx
@@ -81,7 +81,7 @@ tests:
 
     execution:
       target: gpt4_target
-    assert:
+    assertions:
       - name: depth_check
         type: llm-judge
         prompt: ./judges/depth.md
@@ -90,7 +90,7 @@ tests:
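-Per-case `assert` evaluators are **merged** with root-level `assert` evaluators — test-specific evaluators run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`:
+Per-case `assertions` evaluators are **merged** with root-level `assertions` evaluators — test-specific evaluators run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`: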
 
 ```yaml
-assert:
+assertions:
   - name: latency_check
     type: latency
     threshold: 5000
@@ -106,7 +106,7 @@ tests:
     input: Handle this edge case
     execution:
       skip_defaults: true
-    assert:
+    assertions:
       - name: custom_eval
         type: llm-judge
     # Does NOT get latency_check
@@ -179,7 +179,7 @@ tests:
   - id: json-api
     criteria: Returns valid JSON with status field
     input: Return the system status as JSON
-    assert:
+    assertions:
       - type: is-json
       - type: contains
         value: '"status"'
@@ -201,7 +201,7 @@ tests:
     expected_output:
       - role: assistant
         content: "DENIED"
-    assert:
+    assertions:
       - type: contains
         value: "DENIED"
         required: true
@@ -225,7 +225,7 @@ Any evaluator in `assert` can be marked as `required`. When a required evaluator
 | `required: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) |
 
 ```yaml
-assert:
+assertions:
   - type: contains
     value: "DENIED"
     required: true          # must pass (>= 0.8)
@@ -282,7 +282,7 @@ tests:
   - id: mixed-eval
     criteria: Response is helpful and mentions the fix
     input: "Debug this function..."
-    assert:
+    assertions:
       - type: llm-judge        # explicit — receives criteria automatically
       - type: contains
         value: "fix"

diff --git a/apps/web/src/content/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/evaluation/eval-files.mdx
@@ -16,7 +16,7 @@ description: Math problem solving evaluation
 execution:
   target: default
 
-assert:
+assertions:
   - name: correctness
     type: llm-judge
     prompt: ./judges/correctness.md
@@ -76,7 +76,7 @@ The `assert` field is the canonical way to define suite-level evaluators. Suite-
 
 ```yaml
 description: API response validation
-assert:
+assertions:
   - type: is-json
     required: true
   - type: contains
@@ -199,7 +199,7 @@ description: Math evaluation dataset
 dataset: math-tests
 execution:
   target: azure-base
-assert:
+assertions:
   - name: correctness
     type: llm-judge
     prompt: ./judges/correctness.md

diff --git a/apps/web/src/content/docs/evaluation/examples.mdx b/apps/web/src/content/docs/evaluation/examples.mdx
@@ -82,7 +82,7 @@ tests:
   - id: json-generation-with-validation
     criteria: Generates valid JSON with required fields
 
-    assert:
+    assertions:
       - name: json_format_validator
         type: code-judge
         command: [uv, run, validate_json.py]
@@ -117,7 +117,7 @@ tests:
   - id: research-depth
     criteria: Agent researches thoroughly
     input: Research REST vs GraphQL
-    assert:
+    assertions:
       - name: research-check
         type: tool-trajectory
         mode: any_order
@@ -129,7 +129,7 @@ tests:
   - id: auth-flow
     criteria: Agent follows auth sequence
     input: Authenticate user
-    assert:
+    assertions:
       - name: auth-sequence
         type: tool-trajectory
         mode: exact
@@ -150,13 +150,13 @@ execution:
 tests:
   - file://../fixtures/labeled-judge-export.jsonl
 
-assert:
+assertions:
   - name: judge-panel
     type: composite
     aggregator:
       type: threshold
       threshold: 0.6
-    assert:
+    assertions:
       - name: judge-gpt-5-mini
         type: llm-judge
         target: judge_gpt_5_mini
@@ -186,7 +186,7 @@ tests:
   - id: validate-trace-file
     criteria: Trace contains required steps
     input: Analyze trace
-    assert:
+    assertions:
       - name: trace-check
         type: tool-trajectory
         mode: in_order
@@ -293,7 +293,7 @@ tests:
             amount: 5000
             currency: USD
 
-    assert:
+    assertions:
       - name: decision-check
         type: code-judge
         command: [bun, run, ./scripts/check-batch-cli-output.ts]
@@ -326,7 +326,7 @@ tests:
             amount: 2000
             currency: USD
 
-    assert:
+    assertions:
       - name: decision-check
         type: code-judge
         command: [bun, run, ./scripts/check-batch-cli-output.ts]