From 8148eae25adc86cc61d2bafa4c018576085363b3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 03:53:24 +0000 Subject: [PATCH 1/2] refactor: rename assert: to assertions: in EVAL.yaml schema (#603) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rename the YAML key `assert:` to `assertions:` across the codebase. The old `assert:` key is preserved as a deprecated backward-compatible alias with a console warning when used at the suite level. Changes: - eval-file.schema.ts: add `assertions:` as primary field on EvalFileSchema - yaml-parser.ts: read `assertions ?? assert` for suite/test-level fields; emit deprecation warning for old `assert` key - eval-validator.ts: validate `assertions ?? assert`; update error message locations to reference `assertions` - evaluator-parser.ts: update warning messages to reference `assertions` - All examples/*.yaml: rename `assert:` → `assertions:` - All test fixtures: update YAML strings and primary object keys - All docs/README/skills: update YAML code snippets - Regenerate eval-schema.json to reflect updated Zod schema --- README.md | 14 +- apps/cli/README.md | 14 +- apps/cli/src/commands/convert/index.ts | 2 +- apps/cli/src/commands/create/commands.ts | 6 +- .../convert/convert-evals-json.test.ts | 2 +- apps/cli/test/prompt-eval.integration.test.ts | 2 +- .../src/content/docs/evaluation/batch-cli.mdx | 4 +- .../content/docs/evaluation/eval-cases.mdx | 14 +- .../content/docs/evaluation/eval-files.mdx | 6 +- .../src/content/docs/evaluation/examples.mdx | 16 +- .../src/content/docs/evaluation/rubrics.mdx | 10 +- apps/web/src/content/docs/evaluation/sdk.mdx | 2 +- .../content/docs/evaluators/code-judges.mdx | 6 +- .../src/content/docs/evaluators/composite.mdx | 14 +- .../docs/evaluators/custom-assertions.mdx | 6 +- .../docs/evaluators/custom-evaluators.mdx | 6 +- .../docs/evaluators/execution-metrics.mdx | 8 +- .../content/docs/evaluators/llm-judges.mdx | 6 +- .../docs/evaluators/structured-data.mdx | 12 +- .../docs/evaluators/tool-trajectory.mdx | 16 +- .../docs/getting-started/quickstart.mdx | 2 +- .../content/docs/guides/agent-eval-layers.mdx | 10 +- .../docs/guides/agent-skills-evals.mdx | 4 +- .../docs/guides/autoevals-integration.mdx | 6 +- .../guides/skill-improvement-workflow.mdx | 2 +- apps/web/src/content/docs/tools/generate.mdx | 4 +- docs/COMPARISON.md | 4 +- ...026-02-26-eval-schema-generation-design.md | 8 +- .../agent-judge/evals/dataset.eval.yaml | 4 +- .../assert-extended/evals/dataset.eval.yaml | 20 +- .../features/assert/evals/dataset.eval.yaml | 8 +- .../features/basic/evals/dataset.eval.yaml | 2 +- .../batch-cli/evals/dataset.eval.yaml | 8 +- .../code-judge-sdk/evals/dataset.eval.yaml | 2 +- .../evals/contextual-precision.eval.yaml | 2 +- .../evals/contextual-recall.eval.yaml | 2 +- .../composite/evals/dataset.eval.yaml | 18 +- .../evals/dataset.eval.yaml | 6 +- .../evals/dataset.eval.yaml | 14 +- .../evals/confusion-metrics.eval.yaml | 2 +- .../evals/field-accuracy.eval.yaml | 8 +- .../execution-metrics/evals/dataset.eval.yaml | 12 +- .../evals/dataset.eval.yaml | 2 +- .../file-changes/evals/dataset.eval.yaml | 4 +- .../evals/dataset.eval.yaml | 2 +- .../evals/dataset.eval.yaml | 10 +- .../nlp-metrics/evals/dataset.eval.yaml | 10 +- .../evals/dataset.eval.yaml | 4 +- .../repo-lifecycle/evals/dataset.eval.yaml | 2 +- .../repo-lifecycle/evals/pool-e2e.eval.yaml | 4 +- .../features/rubric/evals/dataset.eval.yaml | 8 +- .../sdk-config-file/evals/dataset.eval.yaml | 4 +- .../features/sdk-custom-assertion/README.md | 2 +- .../evals/dataset.eval.yaml | 6 +- .../evals/dataset.eval.yaml | 4 +- .../evals/dataset.eval.yaml | 6 +- .../evals/trace-file-demo.eval.yaml | 12 +- .../evals/dataset.eval.yaml | 22 +- .../trace-evaluation/evals/dataset.eval.yaml | 10 +- .../trial-output-consistency/README.md | 4 +- .../evals/dataset.eval.yaml | 8 +- .../evals/dataset.eval.yaml | 6 +- .../evals/dataset.eval.yaml | 2 +- .../evals/dataset-vscode.eval.yaml | 2 +- .../evals/dataset.eval.yaml | 2 +- .../evals/accuracy/dataset.eval.yaml | 2 +- .../evals/regression/dataset.eval.yaml | 2 +- .../cross-repo-sync/evals/dataset.eval.yaml | 6 +- .../evals/dataset.eval.yaml | 2 +- .../showcase/evaluator-conformance/EVAL.yaml | 4 +- .../export-screening/evals/dataset.eval.yaml | 2 +- .../showcase/multi-model-benchmark/README.md | 6 +- .../evals/benchmark.eval.yaml | 2 +- .../evals/setup-a.eval.yaml | 4 +- .../evals/setup-b.eval.yaml | 4 +- .../evals/encouragement.eval.yaml | 2 +- .../psychotherapy/evals/listening.eval.yaml | 2 +- .../psychotherapy/evals/routing.eval.yaml | 2 +- .../tool-eval-demo.yaml | 8 +- .../evaluation/loaders/evaluator-parser.ts | 8 +- .../src/evaluation/loaders/jsonl-parser.ts | 2 +- .../evaluation/validation/eval-file.schema.ts | 2 + .../evaluation/validation/eval-validator.ts | 10 +- packages/core/src/evaluation/yaml-parser.ts | 26 +- .../test/evaluation/criteria-optional.test.ts | 10 +- .../loaders/evaluator-parser.test.ts | 79 +- .../validation/eval-validator.test.ts | 32 +- packages/eval/src/assertion.ts | 4 +- .../agentv-dev/skills/agentv-bench/SKILL.md | 6 +- .../skills/agentv-chat-to-eval/README.md | 6 +- .../skills/agentv-chat-to-eval/SKILL.md | 8 +- .../examples/transcript-json.md | 8 +- .../examples/transcript-markdown.md | 8 +- .../skills/agentv-eval-analyzer/SKILL.md | 4 +- .../skills/agentv-eval-writer/SKILL.md | 54 +- .../references/eval-schema.json | 4700 ++++++++++++++--- .../references/rubric-evaluator.md | 14 +- 97 files changed, 4418 insertions(+), 1061 deletions(-) diff --git a/README.md b/README.md index 0f330d836..91cf038f2 100644 --- a/README.md +++ b/README.md @@ -58,7 +58,7 @@ tests: expected_output: "42" - assert: + assertions: - name: math_check type: code-judge command: ./validators/check_math.py @@ -162,7 +162,7 @@ description: Math evaluation dataset dataset: math-tests execution: target: azure-base -assert: +assertions: - name: correctness type: llm-judge prompt: ./judges/correctness.md @@ -259,7 +259,7 @@ print(json.dumps({ Reference evaluators in your eval file: ```yaml -assert: +assertions: - name: my_validator type: code-judge command: ./validators/check_answer.py @@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => { Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML: ```yaml -assert: +assertions: - type: word-count # matches word-count.ts - type: contains value: "Hello" @@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed). ```yaml -assert: +assertions: # Case-insensitive matching for natural language variation - type: icontains-any value: ["missing rule code", "need rule code", "provide rule code"] @@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea Create markdown judge files with evaluation criteria and scoring guidelines: ```yaml -assert: +assertions: - name: semantic_check type: llm-judge prompt: ./judges/correctness.md @@ -505,7 +505,7 @@ tests: input: Explain quicksort algorithm - assert: + assertions: - type: rubrics criteria: - Mentions divide-and-conquer approach diff --git a/apps/cli/README.md b/apps/cli/README.md index 0f330d836..91cf038f2 100644 --- a/apps/cli/README.md +++ b/apps/cli/README.md @@ -58,7 +58,7 @@ tests: expected_output: "42" - assert: + assertions: - name: math_check type: code-judge command: ./validators/check_math.py @@ -162,7 +162,7 @@ description: Math evaluation dataset dataset: math-tests execution: target: azure-base -assert: +assertions: - name: correctness type: llm-judge prompt: ./judges/correctness.md @@ -259,7 +259,7 @@ print(json.dumps({ Reference evaluators in your eval file: ```yaml -assert: +assertions: - name: my_validator type: code-judge command: ./validators/check_answer.py @@ -289,7 +289,7 @@ export default defineAssertion(({ answer }) => { Files in `.agentv/assertions/` are auto-discovered by filename — use directly in YAML: ```yaml -assert: +assertions: - type: word-count # matches word-count.ts - type: contains value: "Hello" @@ -439,7 +439,7 @@ Built-in assertion types for common text-matching patterns — no LLM judge or c All assertions support `weight`, `required`, and `negate` flags. Use `negate: true` to invert (no `not_` prefix needed). ```yaml -assert: +assertions: # Case-insensitive matching for natural language variation - type: icontains-any value: ["missing rule code", "need rule code", "provide rule code"] @@ -486,7 +486,7 @@ When agents respond via tool calls instead of text, use `tool_trajectory` instea Create markdown judge files with evaluation criteria and scoring guidelines: ```yaml -assert: +assertions: - name: semantic_check type: llm-judge prompt: ./judges/correctness.md @@ -505,7 +505,7 @@ tests: input: Explain quicksort algorithm - assert: + assertions: - type: rubrics criteria: - Mentions divide-and-conquer approach diff --git a/apps/cli/src/commands/convert/index.ts b/apps/cli/src/commands/convert/index.ts index 897f46991..bb067f17f 100644 --- a/apps/cli/src/commands/convert/index.ts +++ b/apps/cli/src/commands/convert/index.ts @@ -134,7 +134,7 @@ export function convertEvalsJsonToYaml(inputPath: string): string { if (test.assertions && test.assertions.length > 0) { lines.push(' # Promoted from evals.json assertions[]'); lines.push(' # Replace with type: is_json, contains, or regex for deterministic checks'); - lines.push(' assert:'); + lines.push(' assertions:'); for (const assertion of test.assertions) { lines.push(` - name: ${assertion.name}`); lines.push(` type: ${assertion.type}`); diff --git a/apps/cli/src/commands/create/commands.ts b/apps/cli/src/commands/create/commands.ts index 8d07705e8..69ea90511 100644 --- a/apps/cli/src/commands/create/commands.ts +++ b/apps/cli/src/commands/create/commands.ts @@ -40,7 +40,7 @@ tests: criteria: Agent responds correctly input: "Hello, how are you?" expected_output: "I'm doing well" - assert: + assertions: - type: contains value: "well" `, @@ -53,7 +53,7 @@ tests: criteria: Agent responds correctly and completely input: "Hello, how are you?" expected_output: "I'm doing well, thank you for asking!" - assert: + assertions: - type: llm-judge rubric: accuracy: @@ -126,7 +126,7 @@ export const createAssertionCommand = command({ await mkdir(dir, { recursive: true }); await writeFile(filePath, content); console.log(`Created ${path.relative(process.cwd(), filePath)} (template: ${templateName})`); - console.log(`\nUse in EVAL.yaml:\n assert:\n - type: ${name}`); + console.log(`\nUse in EVAL.yaml:\n assertions:\n - type: ${name}`); }, }); diff --git a/apps/cli/test/commands/convert/convert-evals-json.test.ts b/apps/cli/test/commands/convert/convert-evals-json.test.ts index 2b87245fb..f3433199b 100644 --- a/apps/cli/test/commands/convert/convert-evals-json.test.ts +++ b/apps/cli/test/commands/convert/convert-evals-json.test.ts @@ -46,7 +46,7 @@ describe('convertEvalsJsonToYaml', () => { const yaml = convertEvalsJsonToYaml(filePath); expect(yaml).toContain('id: "1"'); expect(yaml).toContain('Just a prompt'); - expect(yaml).not.toContain('assert:'); + expect(yaml).not.toContain('assertions:'); expect(yaml).not.toContain('expected_output:'); }); diff --git a/apps/cli/test/prompt-eval.integration.test.ts b/apps/cli/test/prompt-eval.integration.test.ts index f0691a7ab..8a610aced 100644 --- a/apps/cli/test/prompt-eval.integration.test.ts +++ b/apps/cli/test/prompt-eval.integration.test.ts @@ -33,7 +33,7 @@ async function createFixture(): Promise { tests: - id: greeting-test criteria: Assistant greets the user by name - assert: + assertions: - name: mentions-name type: contains value: Taylor diff --git a/apps/web/src/content/docs/evaluation/batch-cli.mdx b/apps/web/src/content/docs/evaluation/batch-cli.mdx index 91f09a2f0..4747184f7 100644 --- a/apps/web/src/content/docs/evaluation/batch-cli.mdx +++ b/apps/web/src/content/docs/evaluation/batch-cli.mdx @@ -53,7 +53,7 @@ tests: name: Example A amount: 5000 - assert: + assertions: - name: decision-check type: code-judge command: [bun, run, ./scripts/check-output.ts] @@ -81,7 +81,7 @@ tests: name: Example B amount: 25000 - assert: + assertions: - name: decision-check type: code-judge command: [bun, run, ./scripts/check-output.ts] diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx index e0622bfcf..cc1545b64 100644 --- a/apps/web/src/content/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/evaluation/eval-cases.mdx @@ -81,7 +81,7 @@ tests: execution: target: gpt4_target - assert: + assertions: - name: depth_check type: llm-judge prompt: ./judges/depth.md @@ -90,7 +90,7 @@ tests: Per-case `assert` evaluators are **merged** with root-level `assert` evaluators — test-specific evaluators run first, then root-level defaults are appended. To opt out of root-level defaults for a specific test, set `execution.skip_defaults: true`: ```yaml -assert: +assertions: - name: latency_check type: latency threshold: 5000 @@ -106,7 +106,7 @@ tests: input: Handle this edge case execution: skip_defaults: true - assert: + assertions: - name: custom_eval type: llm-judge # Does NOT get latency_check @@ -179,7 +179,7 @@ tests: - id: json-api criteria: Returns valid JSON with status field input: Return the system status as JSON - assert: + assertions: - type: is-json - type: contains value: '"status"' @@ -201,7 +201,7 @@ tests: expected_output: - role: assistant content: "DENIED" - assert: + assertions: - type: contains value: "DENIED" required: true @@ -225,7 +225,7 @@ Any evaluator in `assert` can be marked as `required`. When a required evaluator | `required: 0.6` | Must score >= 0.6 to pass (custom threshold between 0 and 1) | ```yaml -assert: +assertions: - type: contains value: "DENIED" required: true # must pass (>= 0.8) @@ -282,7 +282,7 @@ tests: - id: mixed-eval criteria: Response is helpful and mentions the fix input: "Debug this function..." - assert: + assertions: - type: llm-judge # explicit — receives criteria automatically - type: contains value: "fix" diff --git a/apps/web/src/content/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/evaluation/eval-files.mdx index 935ec9278..f1ddc620f 100644 --- a/apps/web/src/content/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/evaluation/eval-files.mdx @@ -16,7 +16,7 @@ description: Math problem solving evaluation execution: target: default -assert: +assertions: - name: correctness type: llm-judge prompt: ./judges/correctness.md @@ -76,7 +76,7 @@ The `assert` field is the canonical way to define suite-level evaluators. Suite- ```yaml description: API response validation -assert: +assertions: - type: is-json required: true - type: contains @@ -199,7 +199,7 @@ description: Math evaluation dataset dataset: math-tests execution: target: azure-base -assert: +assertions: - name: correctness type: llm-judge prompt: ./judges/correctness.md diff --git a/apps/web/src/content/docs/evaluation/examples.mdx b/apps/web/src/content/docs/evaluation/examples.mdx index 915e937ba..943fa9959 100644 --- a/apps/web/src/content/docs/evaluation/examples.mdx +++ b/apps/web/src/content/docs/evaluation/examples.mdx @@ -82,7 +82,7 @@ tests: - id: json-generation-with-validation criteria: Generates valid JSON with required fields - assert: + assertions: - name: json_format_validator type: code-judge command: [uv, run, validate_json.py] @@ -117,7 +117,7 @@ tests: - id: research-depth criteria: Agent researches thoroughly input: Research REST vs GraphQL - assert: + assertions: - name: research-check type: tool-trajectory mode: any_order @@ -129,7 +129,7 @@ tests: - id: auth-flow criteria: Agent follows auth sequence input: Authenticate user - assert: + assertions: - name: auth-sequence type: tool-trajectory mode: exact @@ -150,13 +150,13 @@ execution: tests: - file://../fixtures/labeled-judge-export.jsonl -assert: +assertions: - name: judge-panel type: composite aggregator: type: threshold threshold: 0.6 - assert: + assertions: - name: judge-gpt-5-mini type: llm-judge target: judge_gpt_5_mini @@ -186,7 +186,7 @@ tests: - id: validate-trace-file criteria: Trace contains required steps input: Analyze trace - assert: + assertions: - name: trace-check type: tool-trajectory mode: in_order @@ -293,7 +293,7 @@ tests: amount: 5000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: [bun, run, ./scripts/check-batch-cli-output.ts] @@ -326,7 +326,7 @@ tests: amount: 2000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: [bun, run, ./scripts/check-batch-cli-output.ts] diff --git a/apps/web/src/content/docs/evaluation/rubrics.mdx b/apps/web/src/content/docs/evaluation/rubrics.mdx index 30d348751..8848a2115 100644 --- a/apps/web/src/content/docs/evaluation/rubrics.mdx +++ b/apps/web/src/content/docs/evaluation/rubrics.mdx @@ -16,7 +16,7 @@ tests: - id: quicksort-explain criteria: Explain how quicksort works input: Explain quicksort algorithm - assert: + assertions: - Mentions divide-and-conquer approach - Explains partition step - States time complexity @@ -33,7 +33,7 @@ tests: - id: quicksort-explain criteria: Explain how quicksort works input: Explain quicksort algorithm - assert: + assertions: - type: rubrics criteria: - Mentions divide-and-conquer approach @@ -46,7 +46,7 @@ tests: For fine-grained control, use rubric objects with weights and requirements: ```yaml -assert: +assertions: - type: rubrics criteria: - id: core-concept @@ -77,7 +77,7 @@ assert: For quality gradients instead of binary pass/fail, use score ranges: ```yaml -assert: +assertions: - type: rubrics criteria: - id: accuracy @@ -134,7 +134,7 @@ tests: - id: code-quality criteria: Generates correct, clean Python code input: Write a fibonacci function - assert: + assertions: - type: rubrics criteria: - Returns correct values for n=0,1,2,10 diff --git a/apps/web/src/content/docs/evaluation/sdk.mdx b/apps/web/src/content/docs/evaluation/sdk.mdx index a020d3dde..6d4eb7640 100644 --- a/apps/web/src/content/docs/evaluation/sdk.mdx +++ b/apps/web/src/content/docs/evaluation/sdk.mdx @@ -67,7 +67,7 @@ Convention-based discovery maps filename → assertion type: Reference directly in your eval file — no `command:` needed: ```yaml -assert: +assertions: - type: word-count - type: contains value: "Hello" diff --git a/apps/web/src/content/docs/evaluators/code-judges.mdx b/apps/web/src/content/docs/evaluators/code-judges.mdx index 962f9626d..c9058e4e2 100644 --- a/apps/web/src/content/docs/evaluators/code-judges.mdx +++ b/apps/web/src/content/docs/evaluators/code-judges.mdx @@ -94,7 +94,7 @@ console.log(JSON.stringify({ ## Referencing in Eval Files ```yaml -assert: +assertions: - name: my_validator type: code-judge command: [./validators/check_answer.py] @@ -139,7 +139,7 @@ Code judges can call an LLM through a target proxy for metrics that require mult Add a `target` block to the evaluator config: ```yaml -assert: +assertions: - name: contextual-precision type: code-judge command: [bun, scripts/contextual-precision.ts] @@ -281,7 +281,7 @@ tests: - id: implement-feature criteria: Agent implements the feature correctly input: "Implement the TODO functions in src/index.ts" - assert: + assertions: - name: functional-check type: code-judge command: [bun, scripts/functional-check.ts] diff --git a/apps/web/src/content/docs/evaluators/composite.mdx b/apps/web/src/content/docs/evaluators/composite.mdx index 8956dec6f..b164fcc76 100644 --- a/apps/web/src/content/docs/evaluators/composite.mdx +++ b/apps/web/src/content/docs/evaluators/composite.mdx @@ -12,10 +12,10 @@ Composite evaluators combine multiple evaluators and aggregate their results int A composite evaluator wraps two or more sub-evaluators and an aggregator that determines the final score: ```yaml -assert: +assertions: - name: my_composite type: composite - assert: + assertions: - name: evaluator_1 type: llm-judge prompt: ./prompts/check1.md @@ -116,10 +116,10 @@ tests: input: Explain quantum computing - assert: + assertions: - name: safety_gate type: composite - assert: + assertions: - name: safety type: llm-judge prompt: ./prompts/safety-check.md @@ -140,7 +140,7 @@ Assign different importance to each evaluation dimension: ```yaml - name: release_readiness type: composite - assert: + assertions: - name: correctness type: llm-judge prompt: ./prompts/correctness.md @@ -165,10 +165,10 @@ Composites can contain other composites for hierarchical evaluation: ```yaml - name: comprehensive_eval type: composite - assert: + assertions: - name: content_quality type: composite - assert: + assertions: - name: accuracy type: llm-judge prompt: ./prompts/accuracy.md diff --git a/apps/web/src/content/docs/evaluators/custom-assertions.mdx b/apps/web/src/content/docs/evaluators/custom-assertions.mdx index 32a43e2ce..1c8ade11f 100644 --- a/apps/web/src/content/docs/evaluators/custom-assertions.mdx +++ b/apps/web/src/content/docs/evaluators/custom-assertions.mdx @@ -49,7 +49,7 @@ Custom assertion types cannot override built-in types (`contains`, `equals`, `is Reference the assertion by type name directly -- no `command:` path needed: ```yaml -assert: +assertions: - type: word-count - type: contains value: "Hello" @@ -224,7 +224,7 @@ tests: criteria: Agent gives a multi-word greeting input: "Say hello and introduce yourself" expected_output: "Hello! I'm an AI assistant here to help you." - assert: + assertions: - type: contains value: "Hello" - type: word-count @@ -233,7 +233,7 @@ tests: criteria: Agent gives a short but valid response input: "What is 2+2?" expected_output: "The answer is 4." - assert: + assertions: - type: contains value: "4" - type: word-count diff --git a/apps/web/src/content/docs/evaluators/custom-evaluators.mdx b/apps/web/src/content/docs/evaluators/custom-evaluators.mdx index f6ba91980..8fec7de4a 100644 --- a/apps/web/src/content/docs/evaluators/custom-evaluators.mdx +++ b/apps/web/src/content/docs/evaluators/custom-evaluators.mdx @@ -23,7 +23,7 @@ Evaluators are configured using `assert` — either top-level (applies to all te ```yaml description: My evaluation -assert: +assertions: - name: correctness type: llm-judge prompt: ./judges/correctness.md @@ -41,7 +41,7 @@ tests: - id: test-1 criteria: Returns valid JSON input: Generate a JSON config - assert: + assertions: - name: json_check type: code-judge command: [./validators/check_json.py] @@ -56,7 +56,7 @@ tests: - id: code-generation criteria: Generates correct Python code input: Write a sorting function - assert: + assertions: - type: rubrics criteria: - Code is syntactically valid diff --git a/apps/web/src/content/docs/evaluators/execution-metrics.mdx b/apps/web/src/content/docs/evaluators/execution-metrics.mdx index 3ca8fe202..5ac6af091 100644 --- a/apps/web/src/content/docs/evaluators/execution-metrics.mdx +++ b/apps/web/src/content/docs/evaluators/execution-metrics.mdx @@ -12,7 +12,7 @@ AgentV provides built-in evaluators for checking execution metrics against thres The `execution_metrics` evaluator provides declarative threshold-based checks on multiple metrics in a single evaluator. ```yaml -assert: +assertions: - name: efficiency type: execution-metrics max_tool_calls: 10 # Maximum tool invocations @@ -50,7 +50,7 @@ tests: - id: efficient-research criteria: Agent researches and summarizes efficiently input: Research the topic and provide a summary - assert: + assertions: - name: efficiency type: execution-metrics max_tool_calls: 15 @@ -65,7 +65,7 @@ tests: Check that an agent maintains a good balance between reading (exploration) and writing (action): ```yaml -assert: +assertions: - name: exploration-balance type: execution-metrics target_exploration_ratio: 0.6 # 60% should be read-only tools @@ -123,7 +123,7 @@ tests: - id: code-generation criteria: Generates correct, efficient code input: Write a sorting algorithm - assert: + assertions: # Semantic quality - name: quality type: llm-judge diff --git a/apps/web/src/content/docs/evaluators/llm-judges.mdx b/apps/web/src/content/docs/evaluators/llm-judges.mdx index b1723aa37..ab6946914 100644 --- a/apps/web/src/content/docs/evaluators/llm-judges.mdx +++ b/apps/web/src/content/docs/evaluators/llm-judges.mdx @@ -26,7 +26,7 @@ When `assert` **is** present, no default judge is added. To use an LLM judge alo Reference an LLM judge in your eval file: ```yaml -assert: +assertions: - name: semantic_check type: llm-judge prompt: ./judges/correctness.md @@ -78,7 +78,7 @@ Score the response from 0.0 to 1.0 based on: By default, an `llm-judge` uses the suite target's `judge_target`. Override it per evaluator when you need multiple judge models in one run: ```yaml -assert: +assertions: - name: judge-gpt type: llm-judge target: judge_gpt_5_mini @@ -130,7 +130,7 @@ Evaluate and provide a score from 0 to 1.`; When using TypeScript templates, configure them in YAML with optional `config` data passed to the command: ```yaml -assert: +assertions: - name: custom-eval type: llm-judge prompt: diff --git a/apps/web/src/content/docs/evaluators/structured-data.mdx b/apps/web/src/content/docs/evaluators/structured-data.mdx index 60f69758d..e29a9f43b 100644 --- a/apps/web/src/content/docs/evaluators/structured-data.mdx +++ b/apps/web/src/content/docs/evaluators/structured-data.mdx @@ -29,7 +29,7 @@ tests: Use `field_accuracy` to compare fields in the candidate JSON against the ground-truth object in `expected_output`. ```yaml -assert: +assertions: - name: invoice_fields type: field-accuracy aggregation: weighted_average @@ -68,7 +68,7 @@ For fuzzy string matching, use a `code_judge` evaluator (e.g. Levenshtein distan Gate on execution time (in milliseconds) reported by the provider via `trace`. ```yaml -assert: +assertions: - name: performance type: latency threshold: 2000 @@ -79,7 +79,7 @@ assert: Gate on monetary cost reported by the provider via `trace`. ```yaml -assert: +assertions: - name: budget type: cost budget: 0.10 @@ -90,7 +90,7 @@ assert: Gate on provider-reported token usage. Useful when cost is unavailable or model pricing differs. ```yaml -assert: +assertions: - name: token-budget type: token-usage max_total: 10000 @@ -104,10 +104,10 @@ assert: Use a `composite` evaluator to produce a single "release gate" score from multiple checks: ```yaml -assert: +assertions: - name: release_gate type: composite - assert: + assertions: - name: correctness type: field-accuracy fields: diff --git a/apps/web/src/content/docs/evaluators/tool-trajectory.mdx b/apps/web/src/content/docs/evaluators/tool-trajectory.mdx index bbcfad1f2..c80e34f33 100644 --- a/apps/web/src/content/docs/evaluators/tool-trajectory.mdx +++ b/apps/web/src/content/docs/evaluators/tool-trajectory.mdx @@ -14,7 +14,7 @@ Tool trajectory evaluators validate that an agent used the expected tools during Validates that each tool was called at least N times, regardless of order: ```yaml -assert: +assertions: - name: tool-usage type: tool-trajectory mode: any_order @@ -30,7 +30,7 @@ Use `any_order` when you want to ensure required tools are used but don't care a Validates tools appear in the expected sequence, but allows gaps (other tools can appear between expected ones): ```yaml -assert: +assertions: - name: workflow-sequence type: tool-trajectory mode: in_order @@ -48,7 +48,7 @@ Use `in_order` when you need to verify logical workflow order while allowing the Validates the exact tool sequence with no gaps or extra tools: ```yaml -assert: +assertions: - name: auth-sequence type: tool-trajectory mode: exact @@ -65,7 +65,7 @@ Use `exact` for security-critical workflows, strict protocol validation, or regr For `in_order` and `exact` modes, you can optionally validate tool arguments: ```yaml -assert: +assertions: - name: search-validation type: tool-trajectory mode: in_order @@ -93,7 +93,7 @@ assert: For `in_order` and `exact` modes, you can validate per-tool timing with `max_duration_ms`: ```yaml -assert: +assertions: - name: perf-check type: tool-trajectory mode: in_order @@ -189,7 +189,7 @@ tests: input: Research machine learning frameworks - assert: + assertions: # Check minimum tool usage - name: coverage type: tool-trajectory @@ -218,7 +218,7 @@ tests: input: Process the customer dataset - assert: + assertions: - name: pipeline-check type: tool-trajectory mode: exact @@ -238,7 +238,7 @@ tests: input: Process the customer dataset quickly - assert: + assertions: - name: pipeline-perf type: tool-trajectory mode: in_order diff --git a/apps/web/src/content/docs/getting-started/quickstart.mdx b/apps/web/src/content/docs/getting-started/quickstart.mdx index 8e6c5b762..68e333040 100644 --- a/apps/web/src/content/docs/getting-started/quickstart.mdx +++ b/apps/web/src/content/docs/getting-started/quickstart.mdx @@ -51,7 +51,7 @@ tests: expected_output: "42" - assert: + assertions: - name: math_check type: code-judge command: [./validators/check_math.py] diff --git a/apps/web/src/content/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/guides/agent-eval-layers.mdx index aca84ba72..783a2ca55 100644 --- a/apps/web/src/content/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/guides/agent-eval-layers.mdx @@ -20,7 +20,7 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based ```yaml # Layer 1: Reasoning — verify the agent's plan makes sense -assert: +assertions: - name: plan-quality type: llm-judge prompt: | @@ -54,7 +54,7 @@ Covers tool call correctness, argument validity, execution path, and redundancy. ```yaml # Layer 2: Action — verify the agent called the right tools -assert: +assertions: - name: tool-sequence type: tool-trajectory mode: in_order @@ -86,7 +86,7 @@ Covers task completion, output correctness, step efficiency, latency, and cost. ```yaml # Layer 3: End-to-End — verify task completion and efficiency -assert: +assertions: - name: answer-correct type: contains value: "42" @@ -119,7 +119,7 @@ Covers prompt injection resilience, policy adherence, bias, and content safety. ```yaml # Layer 4: Safety — verify the agent doesn't do harmful things -assert: +assertions: - name: no-pii-leak type: regex value: "\\d{3}-\\d{2}-\\d{4}" @@ -162,7 +162,7 @@ tests: expected_output: "The capital of France is Paris." - assert: + assertions: # Layer 1: Reasoning - name: reasoning type: llm-judge diff --git a/apps/web/src/content/docs/guides/agent-skills-evals.mdx b/apps/web/src/content/docs/guides/agent-skills-evals.mdx index 358b46916..b1c9f5b1a 100644 --- a/apps/web/src/content/docs/guides/agent-skills-evals.mdx +++ b/apps/web/src/content/docs/guides/agent-skills-evals.mdx @@ -163,7 +163,7 @@ tests: content: "Find the top 3 months by revenue." # Promoted from evals.json assertions[] # Replace with type: is_json, contains, or regex for deterministic checks - assert: + assertions: - name: assertion-1 type: llm-judge prompt: "Output identifies November as the highest revenue month" @@ -228,7 +228,7 @@ tests: A customer says their order #12345 hasn't arrived after 2 weeks. Help them. expected_output: | An empathetic response that offers to track the order and provides next steps. - assert: + assertions: - name: acknowledges-frustration type: llm-judge prompt: Response acknowledges the customer's frustration diff --git a/apps/web/src/content/docs/guides/autoevals-integration.mdx b/apps/web/src/content/docs/guides/autoevals-integration.mdx index 99bf0c1f9..abfb32dca 100644 --- a/apps/web/src/content/docs/guides/autoevals-integration.mdx +++ b/apps/web/src/content/docs/guides/autoevals-integration.mdx @@ -61,7 +61,7 @@ tests: - role: user content: "What is the capital of France?" expected_output: "Paris is the capital of France." - assert: + assertions: - name: factuality type: code-judge command: ["bun", "run", "judges/factuality.ts"] @@ -110,7 +110,7 @@ tests: - role: user content: "Summarize the key findings from the research paper." expected_output: "The paper found that transformer models outperform RNNs on long-range tasks." - assert: + assertions: - name: faithfulness type: code-judge command: ["python", "judges/faithfulness.py"] @@ -198,7 +198,7 @@ tests: - role: user content: "What are the benefits of exercise?" expected_output: "Exercise improves cardiovascular health, mental well-being, and longevity." - assert: + assertions: - name: rag-quality type: code-judge command: ["bun", "run", "judges/rag-suite.ts"] diff --git a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx index fed99081a..b68613fb2 100644 --- a/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx +++ b/apps/web/src/content/docs/guides/skill-improvement-workflow.mdx @@ -242,7 +242,7 @@ tests: input: - role: user content: "Review this Python function for bugs:..." - assert: + assertions: - name: assertion-1 type: llm-judge prompt: "Identifies the division by zero risk" diff --git a/apps/web/src/content/docs/tools/generate.mdx b/apps/web/src/content/docs/tools/generate.mdx index 8e38977fd..f309d2e39 100644 --- a/apps/web/src/content/docs/tools/generate.mdx +++ b/apps/web/src/content/docs/tools/generate.mdx @@ -22,7 +22,7 @@ This analyzes each test's `criteria` field and creates structured rubric criteri 1. Reads each test's `criteria` 2. Uses an LLM to decompose the criteria into individual checkable rubric items 3. Assigns weights based on importance -4. Writes rubric criteria back under `assert: - type: rubrics` +4. Writes rubric criteria back under `assertions: - type: rubrics` ## Example @@ -42,7 +42,7 @@ tests: - id: quicksort criteria: Explains quicksort with time complexity and examples input: Explain quicksort - assert: + assertions: - type: rubrics criteria: - Explains divide-and-conquer approach diff --git a/docs/COMPARISON.md b/docs/COMPARISON.md index f1d9c1a8c..88c741444 100644 --- a/docs/COMPARISON.md +++ b/docs/COMPARISON.md @@ -23,7 +23,7 @@ **1. Hybrid Judge System (Code + LLM with Custom Prompts)** ```yaml -assert: +assertions: - name: format_check type: code_judge # Deterministic: checks concrete outputs command: ./validators/check_format.py @@ -118,7 +118,7 @@ Alternative approaches: ### Scenario: Deterministic + Subjective Evaluation ```yaml -assert: +assertions: - name: syntax_check type: code_judge command: ["python", "check_syntax.py"] diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md index c1edcc646..a20a7909f 100644 --- a/docs/plans/2026-02-26-eval-schema-generation-design.md +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -175,7 +175,7 @@ const AggregatorSchema = z.discriminatedUnion('type', [ const CompositeSchema: z.ZodType = z.lazy(() => EvaluatorCommonSchema.extend({ type: z.literal('composite'), - assert: z.array(EvaluatorSchema).optional(), + assertions: z.array(EvaluatorSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), aggregator: AggregatorSchema, }), @@ -333,7 +333,7 @@ const TrialsSchema = z.object({ const ExecutionSchema = z.object({ target: z.string().optional(), targets: z.array(z.string()).optional(), - assert: z.array(EvaluatorSchema).optional(), + assertions: z.array(EvaluatorSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), skip_defaults: z.boolean().optional(), cache: z.boolean().optional(), @@ -352,7 +352,7 @@ const EvalTestSchema = z.object({ expected_outcome: z.string().optional(), input: InputSchema.optional(), expected_output: ExpectedOutputSchema.optional(), - assert: z.array(EvaluatorSchema).optional(), + assertions: z.array(EvaluatorSchema).optional(), evaluators: z.array(EvaluatorSchema).optional(), execution: ExecutionSchema.optional(), workspace: WorkspaceSchema.optional(), @@ -387,7 +387,7 @@ export const EvalFileSchema = z.object({ // Execution execution: ExecutionSchema.optional(), // Suite-level assertions - assert: z.array(EvaluatorSchema).optional(), + assertions: z.array(EvaluatorSchema).optional(), // Workspace workspace: WorkspaceSchema.optional(), }); diff --git a/examples/features/agent-judge/evals/dataset.eval.yaml b/examples/features/agent-judge/evals/dataset.eval.yaml index bc87f4d37..a9bf21048 100644 --- a/examples/features/agent-judge/evals/dataset.eval.yaml +++ b/examples/features/agent-judge/evals/dataset.eval.yaml @@ -25,7 +25,7 @@ tests: - type: text value: Create unit tests for all functions in src/main.ts - assert: + assertions: - name: workspace-audit type: agent-judge max_steps: 5 @@ -42,7 +42,7 @@ tests: - type: text value: Create unit tests for all functions in src/main.ts - assert: + assertions: - name: workspace-audit-rubric type: agent-judge max_steps: 5 diff --git a/examples/features/assert-extended/evals/dataset.eval.yaml b/examples/features/assert-extended/evals/dataset.eval.yaml index 8d9d9da48..8180da129 100644 --- a/examples/features/assert-extended/evals/dataset.eval.yaml +++ b/examples/features/assert-extended/evals/dataset.eval.yaml @@ -16,7 +16,7 @@ tests: criteria: Response should include some form of greeting input: "Greet the user warmly." expected_output: "Hello! Welcome aboard." - assert: + assertions: - type: contains-any value: ["Hello", "Hi", "Hey", "Welcome", "Greetings"] @@ -31,7 +31,7 @@ tests: - role: user content: "Confirm my details: name is Alice, email is alice@example.com" expected_output: "Confirmed: Alice, alice@example.com" - assert: + assertions: - type: contains-all value: ["Alice", "alice@example.com"] @@ -42,7 +42,7 @@ tests: criteria: Response mentions "error" in any case input: "Report the system status." expected_output: "No errors detected. System is healthy." - assert: + assertions: - type: icontains value: "error" @@ -54,7 +54,7 @@ tests: criteria: Agent asks for missing rule codes input: "Process this customs declaration. Country: BE." expected_output: "I still need the rule codes to process this declaration." - assert: + assertions: - type: icontains-any value: ["missing rule code", "need rule code", "provide rule code", "share rule code", "require rule code"] required: true @@ -66,7 +66,7 @@ tests: criteria: Response mentions all required field types input: "What fields are needed for a customs entry?" expected_output: "You need the Country Code, Rule Codes, and Expected Values." - assert: + assertions: - type: icontains-all value: ["country code", "rule codes", "expected values"] @@ -77,7 +77,7 @@ tests: criteria: Response starts with a formal prefix input: "Write a formal letter opening." expected_output: "Dear Sir/Madam, I am writing to inform you..." - assert: + assertions: - type: starts-with value: "Dear" @@ -88,7 +88,7 @@ tests: criteria: Response ends with a professional sign-off input: "End your response with 'Best regards'" expected_output: "Thank you for your inquiry. Best regards" - assert: + assertions: - type: ends-with value: "Best regards" @@ -99,7 +99,7 @@ tests: criteria: Response contains an email pattern (case-insensitive) input: "Provide a support email." expected_output: "Contact us at Support@Example.COM" - assert: + assertions: - type: regex value: "[a-z]+@[a-z]+\\.[a-z]+" flags: "i" @@ -111,7 +111,7 @@ tests: criteria: Response must NOT mention any competitor input: "Describe our product advantages." expected_output: "Our product offers best-in-class performance and reliability." - assert: + assertions: - type: contains-any value: ["CompetitorA", "CompetitorB", "CompetitorC"] negate: true @@ -124,7 +124,7 @@ tests: criteria: Agent should ask for missing rule codes and mention expected format input: "Process customs entry for country BE. No other data provided." expected_output: "I need the Customs Rule Codes to process this entry. Please provide them as true/false values (e.g., AU123 = true)." - assert: + assertions: - name: asks-for-rule-codes type: icontains-any value: ["rule code", "rule codes"] diff --git a/examples/features/assert/evals/dataset.eval.yaml b/examples/features/assert/evals/dataset.eval.yaml index ed5e4e405..4ddcfc722 100644 --- a/examples/features/assert/evals/dataset.eval.yaml +++ b/examples/features/assert/evals/dataset.eval.yaml @@ -18,7 +18,7 @@ tests: expected_output: - role: assistant content: Hello world! - assert: + assertions: - type: contains value: Hello - type: regex @@ -37,7 +37,7 @@ tests: expected_output: - role: assistant content: '{"status": "ok", "code": 200}' - assert: + assertions: - type: is-json required: true - type: contains @@ -56,7 +56,7 @@ tests: expected_output: - role: assistant content: Good morning! It's a pleasure to meet you. - assert: + assertions: - type: regex value: "Good (morning|afternoon|evening)" required: true @@ -74,7 +74,7 @@ tests: expected_output: - role: assistant content: "4" - assert: + assertions: - type: equals value: "4" required: true diff --git a/examples/features/basic/evals/dataset.eval.yaml b/examples/features/basic/evals/dataset.eval.yaml index 573252023..c4ab9cbbf 100644 --- a/examples/features/basic/evals/dataset.eval.yaml +++ b/examples/features/basic/evals/dataset.eval.yaml @@ -74,7 +74,7 @@ tests: target: azure-base # Multiple evaluators - supports both code-based and LLM judges - assert: + assertions: - name: keyword_check type: code-judge # Code evaluators handle regex, keywords, linting, etc. command: ["uv", "run", "check_python_keywords.py"] diff --git a/examples/features/batch-cli/evals/dataset.eval.yaml b/examples/features/batch-cli/evals/dataset.eval.yaml index 11b83b847..f58a8f7ab 100644 --- a/examples/features/batch-cli/evals/dataset.eval.yaml +++ b/examples/features/batch-cli/evals/dataset.eval.yaml @@ -42,7 +42,7 @@ tests: amount: 5000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: ["bun", "run", "../judges/check-batch-cli-output.ts"] @@ -81,7 +81,7 @@ tests: amount: 2000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: ["bun", "run", "../judges/check-batch-cli-output.ts"] @@ -120,7 +120,7 @@ tests: amount: 25000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: ["bun", "run", "../judges/check-batch-cli-output.ts"] @@ -159,7 +159,7 @@ tests: amount: 25000 currency: USD - assert: + assertions: - name: decision-check type: code-judge command: ["bun", "run", "../judges/check-batch-cli-output.ts"] diff --git a/examples/features/code-judge-sdk/evals/dataset.eval.yaml b/examples/features/code-judge-sdk/evals/dataset.eval.yaml index 947f572dc..ac0ee1ac3 100644 --- a/examples/features/code-judge-sdk/evals/dataset.eval.yaml +++ b/examples/features/code-judge-sdk/evals/dataset.eval.yaml @@ -29,7 +29,7 @@ tests: content: |- Attachments detected (2): example.txt, python.instructions.md. - assert: + assertions: - name: attachment-check type: code-judge command: ["bun", "run", "../scripts/verify-attachments.ts"] diff --git a/examples/features/code-judge-with-llm-calls/evals/contextual-precision.eval.yaml b/examples/features/code-judge-with-llm-calls/evals/contextual-precision.eval.yaml index bc55ea7b3..95073ab2c 100644 --- a/examples/features/code-judge-with-llm-calls/evals/contextual-precision.eval.yaml +++ b/examples/features/code-judge-with-llm-calls/evals/contextual-precision.eval.yaml @@ -17,7 +17,7 @@ # mixed-ranking: ~0.833 (2 relevant nodes with 1 irrelevant between) # relevant-node-last: ~0.333 (relevant node ranked last — worst case) -assert: +assertions: - name: contextual_precision type: code-judge command: [bun, run, ../scripts/contextual-precision.ts] diff --git a/examples/features/code-judge-with-llm-calls/evals/contextual-recall.eval.yaml b/examples/features/code-judge-with-llm-calls/evals/contextual-recall.eval.yaml index 88df3c7a0..83679f058 100644 --- a/examples/features/code-judge-with-llm-calls/evals/contextual-recall.eval.yaml +++ b/examples/features/code-judge-with-llm-calls/evals/contextual-recall.eval.yaml @@ -21,7 +21,7 @@ # partial-recall: ~0.333 (only 1 of 3 statements attributable to retrieval) # zero-recall: ~0.000 (no retrieval context supports the expected answer) -assert: +assertions: - name: contextual_recall type: code-judge command: [bun, run, ../scripts/contextual-recall.ts] diff --git a/examples/features/composite/evals/dataset.eval.yaml b/examples/features/composite/evals/dataset.eval.yaml index 6f9e8c819..17b96a950 100644 --- a/examples/features/composite/evals/dataset.eval.yaml +++ b/examples/features/composite/evals/dataset.eval.yaml @@ -17,10 +17,10 @@ tests: Machine learning is a subset of AI that enables systems to learn from data and improve performance without explicit programming. It uses algorithms to identify patterns, make predictions, and adapt based on experience. criteria: | The response should be both concise and detailed, balancing brevity with informative content. - assert: + assertions: - name: release_gate type: composite - assert: + assertions: - name: safety type: llm-judge prompt: ../prompts/safety-check.md @@ -44,10 +44,10 @@ tests: Quantum computing uses quantum bits (qubits) that can exist in multiple states simultaneously, unlike classical bits that are either 0 or 1. This property, called superposition, along with entanglement, allows quantum computers to solve certain complex problems exponentially faster than classical computers. criteria: | The response should be accurate, accessible to non-experts, and pass safety checks. - assert: + assertions: - name: safety_gate type: composite - assert: + assertions: - name: safety type: llm-judge prompt: ../prompts/safety-check-strict.md @@ -70,10 +70,10 @@ tests: Premium wireless headphones featuring active noise cancellation, 30-hour battery life, premium sound quality with enhanced bass, comfortable over-ear design, and seamless Bluetooth 5.0 connectivity. criteria: | The response should balance conciseness with detail effectively. - assert: + assertions: - name: final_decision type: composite - assert: + assertions: - name: conciseness type: llm-judge prompt: ../prompts/conciseness-check.md @@ -95,13 +95,13 @@ tests: Supervised learning uses labeled training data to learn patterns and make predictions, like classifying emails as spam or not spam. Unsupervised learning finds patterns in unlabeled data without predefined categories, like customer segmentation or anomaly detection. criteria: | The response should be accurate, clear, safe, and appropriately detailed. - assert: + assertions: - name: comprehensive_evaluation type: composite - assert: + assertions: - name: content_quality type: composite - assert: + assertions: - name: accuracy type: llm-judge prompt: ../prompts/accuracy-check.md diff --git a/examples/features/default-evaluators/evals/dataset.eval.yaml b/examples/features/default-evaluators/evals/dataset.eval.yaml index 5a38d9ba3..361ae7e08 100644 --- a/examples/features/default-evaluators/evals/dataset.eval.yaml +++ b/examples/features/default-evaluators/evals/dataset.eval.yaml @@ -7,7 +7,7 @@ description: Root-level evaluators that automatically apply to every test execution: target: default -assert: +assertions: - name: tone_check type: llm-judge prompt: | @@ -27,7 +27,7 @@ tests: criteria: The assistant provides a helpful response about refunds input: "I want a refund" expected_output: "I'd be happy to help you with a refund. Could you provide your order number?" - assert: + assertions: - name: helpfulness type: llm-judge # Also gets tone_check from root-level assert @@ -42,7 +42,7 @@ tests: expected_output: "I understand this is urgent. Let me help you right away. Can you tell me which system is affected so I can start troubleshooting?" execution: skip_defaults: true - assert: + assertions: - name: urgency_check type: llm-judge # Does NOT get tone_check — skip_defaults opts out diff --git a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml index fb0bd4007..63487065c 100644 --- a/examples/features/deterministic-evaluators/evals/dataset.eval.yaml +++ b/examples/features/deterministic-evaluators/evals/dataset.eval.yaml @@ -14,7 +14,7 @@ tests: criteria: Response mentions the word "Hello" input: "Say hello to the user." expected_output: "Hello there! How can I help you today?" - assert: + assertions: - type: contains value: "Hello" @@ -27,7 +27,7 @@ tests: - role: user content: "Provide your contact email." expected_output: "You can reach me at support@example.com." - assert: + assertions: - type: regex value: "[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,}" @@ -40,7 +40,7 @@ tests: - role: user content: "What is 2+2?" expected_output: "4" - assert: + assertions: - type: equals value: "4" @@ -49,7 +49,7 @@ tests: criteria: Response begins with a greeting input: "Start your reply with 'Dear User'." expected_output: "Dear User, thank you for contacting us." - assert: + assertions: - type: regex value: "^Dear User" @@ -62,7 +62,7 @@ tests: - role: user content: "Return a JSON object with a status field set to ok and code 200." expected_output: '{"status": "ok", "code": 200}' - assert: + assertions: - type: is-json # --- combining multiple assertions on one test --- @@ -74,7 +74,7 @@ tests: - role: user content: 'Return a JSON object with a "result" key set to the number 42.' expected_output: '{"result": 42}' - assert: + assertions: - type: is-json required: true - type: contains @@ -89,7 +89,7 @@ tests: - role: user content: 'Return a JSON object with a "message" field set to "success".' expected_output: '{"message": "success"}' - assert: + assertions: - type: is-json required: true - type: contains diff --git a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml index 5ef0d2666..a07ef2389 100644 --- a/examples/features/document-extraction/evals/confusion-metrics.eval.yaml +++ b/examples/features/document-extraction/evals/confusion-metrics.eval.yaml @@ -30,7 +30,7 @@ description: Header field confusion metrics (TP/TN/FP/FN aggregation) execution: target: mock_extractor -assert: +assertions: - name: header_confusion type: code-judge command: ["bun", "run", "../judges/header_confusion_metrics.ts"] diff --git a/examples/features/document-extraction/evals/field-accuracy.eval.yaml b/examples/features/document-extraction/evals/field-accuracy.eval.yaml index d51bc5f8d..160962202 100644 --- a/examples/features/document-extraction/evals/field-accuracy.eval.yaml +++ b/examples/features/document-extraction/evals/field-accuracy.eval.yaml @@ -27,7 +27,7 @@ description: Field accuracy evaluator patterns (per-evalcase scoring) execution: target: mock_extractor -assert: +assertions: # Primary evaluator: Correctness via field-level accuracy - name: invoice_field_accuracy type: field-accuracy @@ -210,7 +210,7 @@ tests: This simulates OCR output that preserves document formatting. Uses code_judge with config pass-through for multi-field fuzzy matching. - assert: + assertions: # Multi-field fuzzy match with config pass-through - name: party_names_fuzzy type: code-judge @@ -339,7 +339,7 @@ tests: criteria: | Extractor correctly extracts first two line items with proper array structure. Field paths like line_items[0].description should resolve correctly. - assert: + assertions: - name: line_items_check type: field-accuracy fields: @@ -391,7 +391,7 @@ tests: This test demonstrates greedy matching for line items. Index-based comparison would incorrectly penalize the extractor, but greedy matching correctly aligns items by description similarity. - assert: + assertions: - name: line_items_matched type: code-judge command: ["bun", "run", "../judges/line_item_matching.ts"] diff --git a/examples/features/execution-metrics/evals/dataset.eval.yaml b/examples/features/execution-metrics/evals/dataset.eval.yaml index 37da44b1e..19a3a60e6 100644 --- a/examples/features/execution-metrics/evals/dataset.eval.yaml +++ b/examples/features/execution-metrics/evals/dataset.eval.yaml @@ -35,7 +35,7 @@ tests: - role: user content: Hello, this is a simple question. - assert: + assertions: - name: efficiency-check type: execution-metrics max_tool_calls: 10 @@ -55,7 +55,7 @@ tests: - role: user content: Hello, give me a simple response. - assert: + assertions: - name: full-efficiency-check type: execution-metrics max_tool_calls: 15 @@ -78,7 +78,7 @@ tests: - role: user content: Research and analyze the topic of machine learning. - assert: + assertions: # Check tool trajectory - name: trajectory-check type: tool-trajectory @@ -106,7 +106,7 @@ tests: - role: user content: Implement a small code improvement based on the existing file. - assert: + assertions: - name: exploration-balance type: execution-metrics target_exploration_ratio: 0.5 # 50% should be read-only tools @@ -125,7 +125,7 @@ tests: - role: user content: Generate a brief summary. - assert: + assertions: - name: cost-check type: execution-metrics max_cost_usd: 0.05 @@ -144,7 +144,7 @@ tests: - role: user content: Process the data efficiently. - assert: + assertions: # Declarative threshold checks - name: metric-thresholds type: execution-metrics diff --git a/examples/features/file-changes-judges/evals/dataset.eval.yaml b/examples/features/file-changes-judges/evals/dataset.eval.yaml index 7c3a4bc7e..65ebd68df 100644 --- a/examples/features/file-changes-judges/evals/dataset.eval.yaml +++ b/examples/features/file-changes-judges/evals/dataset.eval.yaml @@ -26,7 +26,7 @@ tests: - type: text value: Add a subtract function to src/calculator.ts - assert: + assertions: # 1. LLM judge with rubrics — Azure evaluates file_changes diff - name: llm-judge-rubrics type: rubrics diff --git a/examples/features/file-changes/evals/dataset.eval.yaml b/examples/features/file-changes/evals/dataset.eval.yaml index 398be26c2..69aad9e71 100644 --- a/examples/features/file-changes/evals/dataset.eval.yaml +++ b/examples/features/file-changes/evals/dataset.eval.yaml @@ -27,7 +27,7 @@ tests: - type: text value: Edit hello.txt and config.json, create src/utils.ts and tests/main.test.ts, delete obsolete.log. - assert: + assertions: - name: check-edits-and-creates type: code-judge command: ["uv", "run", "../check_file_changes.py"] @@ -47,7 +47,7 @@ tests: - type: text value: Edit hello.txt and config.json, create src/utils.ts and tests/main.test.ts, delete obsolete.log. - assert: + assertions: - name: check-deletes-and-structure type: code-judge command: ["uv", "run", "../check_file_changes.py"] diff --git a/examples/features/functional-grading/evals/dataset.eval.yaml b/examples/features/functional-grading/evals/dataset.eval.yaml index e4806109b..f5a04d690 100644 --- a/examples/features/functional-grading/evals/dataset.eval.yaml +++ b/examples/features/functional-grading/evals/dataset.eval.yaml @@ -24,7 +24,7 @@ tests: Implement the add, multiply, and fibonacci functions in src/index.ts. The function signatures are already defined — replace the throw statements with working implementations. - assert: + assertions: - name: functional-check type: code-judge command: ["bun", "run", "../scripts/functional-check.ts"] diff --git a/examples/features/latency-assertions/evals/dataset.eval.yaml b/examples/features/latency-assertions/evals/dataset.eval.yaml index 1e8d823ac..5c581de9d 100644 --- a/examples/features/latency-assertions/evals/dataset.eval.yaml +++ b/examples/features/latency-assertions/evals/dataset.eval.yaml @@ -49,7 +49,7 @@ tests: - role: user content: Read the config.json file. - assert: + assertions: - name: fast-read type: tool-trajectory mode: in_order @@ -69,7 +69,7 @@ tests: - role: user content: Read the large-file.json. - assert: + assertions: - name: slow-read type: tool-trajectory mode: in_order @@ -90,7 +90,7 @@ tests: - role: user content: Process the customer data from the API. - assert: + assertions: - name: data-pipeline-perf type: tool-trajectory mode: in_order @@ -115,7 +115,7 @@ tests: - role: user content: Authenticate the user with provided credentials. - assert: + assertions: - name: auth-perf type: tool-trajectory mode: exact @@ -139,7 +139,7 @@ tests: - role: user content: What's the weather like in Paris? - assert: + assertions: - name: weather-perf type: tool-trajectory mode: in_order diff --git a/examples/features/nlp-metrics/evals/dataset.eval.yaml b/examples/features/nlp-metrics/evals/dataset.eval.yaml index 6208a333b..5a02020ec 100644 --- a/examples/features/nlp-metrics/evals/dataset.eval.yaml +++ b/examples/features/nlp-metrics/evals/dataset.eval.yaml @@ -19,7 +19,7 @@ tests: - role: assistant content: The quick brown fox jumps over the lazy dog near the river bank. - assert: + assertions: - name: rouge-score type: code-judge command: ["bun", "run", "../judges/rouge.ts"] @@ -35,7 +35,7 @@ tests: - role: assistant content: The cat sat on the mat and watched the birds in the garden. - assert: + assertions: - name: bleu-score type: code-judge command: ["bun", "run", "../judges/bleu.ts"] @@ -51,7 +51,7 @@ tests: - role: assistant content: Machine learning models require large amounts of training data to perform well. - assert: + assertions: - name: cosine-similarity type: code-judge command: ["bun", "run", "../judges/similarity.ts"] @@ -67,7 +67,7 @@ tests: - role: assistant content: "Widget Pro 3000" - assert: + assertions: - name: edit-distance type: code-judge command: ["bun", "run", "../judges/levenshtein.ts"] @@ -83,7 +83,7 @@ tests: - role: assistant content: Artificial intelligence is transforming healthcare by enabling faster diagnosis and personalised treatment plans. - assert: + assertions: - name: rouge-score type: code-judge command: ["bun", "run", "../judges/rouge.ts"] diff --git a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml index e1923ff62..379cff6f2 100644 --- a/examples/features/prompt-template-sdk/evals/dataset.eval.yaml +++ b/examples/features/prompt-template-sdk/evals/dataset.eval.yaml @@ -21,7 +21,7 @@ tests: reference_answer: |- TypeScript provides static type checking, better IDE support, and improved maintainability. - assert: + assertions: - name: custom-prompt-eval type: llm-judge # Executable prompt template using explicit script array (matches code_judge pattern) @@ -40,7 +40,7 @@ tests: reference_answer: |- Async/await is syntactic sugar over Promises that makes asynchronous code look synchronous. - assert: + assertions: - name: strict-eval type: llm-judge # Executable prompt template with config diff --git a/examples/features/repo-lifecycle/evals/dataset.eval.yaml b/examples/features/repo-lifecycle/evals/dataset.eval.yaml index 121e04a38..1c544e7c0 100644 --- a/examples/features/repo-lifecycle/evals/dataset.eval.yaml +++ b/examples/features/repo-lifecycle/evals/dataset.eval.yaml @@ -25,7 +25,7 @@ tests: input: >- Read the file at repo/packages/core/package.json and tell me the exact package name and version number. - assert: + assertions: - type: contains value: "@agentv/core" - type: regex diff --git a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml index 5540a3421..7e7943eee 100644 --- a/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml +++ b/examples/features/repo-lifecycle/evals/pool-e2e.eval.yaml @@ -23,13 +23,13 @@ tests: - id: test-1-core-name criteria: Report the core package name input: Read repo/packages/core/package.json and tell me the package name. - assert: + assertions: - type: contains value: "@agentv/core" - id: test-2-cli-name criteria: Report the CLI package name input: Read repo/apps/cli/package.json and tell me the package name. - assert: + assertions: - type: contains value: "agentv" diff --git a/examples/features/rubric/evals/dataset.eval.yaml b/examples/features/rubric/evals/dataset.eval.yaml index 2ec6c417a..c546e19e6 100644 --- a/examples/features/rubric/evals/dataset.eval.yaml +++ b/examples/features/rubric/evals/dataset.eval.yaml @@ -33,7 +33,7 @@ tests: - Best/Average: O(n log n) - Worst case: O(n²) when poorly chosen pivots - assert: + assertions: - Mentions divide-and-conquer approach - Explains the partition step - States time complexity correctly @@ -68,7 +68,7 @@ tests: - 500 Internal Server Error: Server-side error # Detailed rubric objects with weights and required flags - assert: + assertions: - type: rubrics criteria: - id: structure @@ -132,7 +132,7 @@ tests: pattern = r'^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$' return bool(re.match(pattern, email)) - assert: + assertions: # Rubric evaluator for semantic checks - type: rubrics criteria: @@ -204,7 +204,7 @@ tests: more extreme weather. Scientists urge urgent emissions cuts and a transition to renewable energy. - assert: + assertions: - type: rubrics criteria: - id: factual_accuracy diff --git a/examples/features/sdk-config-file/evals/dataset.eval.yaml b/examples/features/sdk-config-file/evals/dataset.eval.yaml index 92ab28681..1c2b647a6 100644 --- a/examples/features/sdk-config-file/evals/dataset.eval.yaml +++ b/examples/features/sdk-config-file/evals/dataset.eval.yaml @@ -12,7 +12,7 @@ tests: criteria: Agent responds with a greeting input: "Hello!" expected_output: "Hello! How can I help you?" - assert: + assertions: - type: contains value: "Hello" @@ -24,5 +24,5 @@ tests: - role: user content: "Return status ok" expected_output: '{"status": "ok"}' - assert: + assertions: - type: is-json diff --git a/examples/features/sdk-custom-assertion/README.md b/examples/features/sdk-custom-assertion/README.md index d3f8ddc80..36089f23f 100644 --- a/examples/features/sdk-custom-assertion/README.md +++ b/examples/features/sdk-custom-assertion/README.md @@ -5,7 +5,7 @@ Demonstrates creating a custom assertion type using `defineAssertion()` from `@a ## What It Does 1. Defines a `word-count` assertion in `.agentv/assertions/word-count.ts` -2. Uses it in EVAL.yaml via `type: word-count` under `assert:` +2. Uses it in EVAL.yaml via `type: word-count` under `assertions:` 3. The assertion checks that the output has a minimum word count ## How to Run diff --git a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml index 61bfd8328..a25078e06 100644 --- a/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml +++ b/examples/features/sdk-custom-assertion/evals/dataset.eval.yaml @@ -12,7 +12,7 @@ tests: criteria: Agent gives a multi-word greeting input: "Say hello and introduce yourself" expected_output: "Hello! I'm an AI assistant here to help you with your questions." - assert: + assertions: - type: contains value: "Hello" - type: word-count @@ -21,7 +21,7 @@ tests: criteria: Agent gives a short but valid response input: "What is 2+2?" expected_output: "The answer is 4." - assert: + assertions: - type: contains value: "4" - type: word-count @@ -34,7 +34,7 @@ tests: - role: user content: "Return a JSON object with name and age fields." expected_output: '{"name": "Alice", "age": 30}' - assert: + assertions: - type: is-json required: true - type: word-count diff --git a/examples/features/threshold-evaluator/evals/dataset.eval.yaml b/examples/features/threshold-evaluator/evals/dataset.eval.yaml index b2072607a..bec59a029 100644 --- a/examples/features/threshold-evaluator/evals/dataset.eval.yaml +++ b/examples/features/threshold-evaluator/evals/dataset.eval.yaml @@ -18,13 +18,13 @@ tests: Renewable energy reduces greenhouse gas emissions, lowers long-term energy costs, and decreases dependence on finite fossil fuels. criteria: | The response should be accurate, concise, and cover key benefits of renewable energy. - assert: + assertions: - name: flexible_gate type: composite aggregator: type: threshold threshold: 0.5 - assert: + assertions: - name: accuracy_check type: llm-judge prompt: | diff --git a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml index f9df1d55f..d4c03e326 100644 --- a/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml +++ b/examples/features/tool-evaluation-plugins/evals/dataset.eval.yaml @@ -26,7 +26,7 @@ tests: - role: user content: What is the weather in Tokyo? Fetch the detailed forecast. - assert: + assertions: - name: tool-f1 type: code-judge command: ["bun", "run", "../judges/tool-call-f1.ts"] @@ -44,7 +44,7 @@ tests: - role: user content: What is the weather in Tokyo? Fetch the detailed forecast. - assert: + assertions: - name: tool-args-f1 type: code-judge command: ["bun", "run", "../judges/tool-args-f1.ts"] @@ -65,7 +65,7 @@ tests: - role: user content: Analyze the quarterly sales data and generate a summary report. - assert: + assertions: # Built-in: verify tool sequence - name: trajectory-check type: tool-trajectory diff --git a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml index 77f1dcac3..07609020a 100644 --- a/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml +++ b/examples/features/tool-trajectory-advanced/evals/trace-file-demo.eval.yaml @@ -35,7 +35,7 @@ tests: - role: user content: Research the ThinkPad X1 Carbon Gen 11 laptop and provide a recommendation. - assert: + assertions: - name: search-then-fetch type: tool-trajectory mode: in_order @@ -60,7 +60,7 @@ tests: - role: user content: Research the ThinkPad X1 Carbon Gen 11 laptop and provide a recommendation. - assert: + assertions: - name: exact-workflow type: tool-trajectory mode: exact @@ -87,7 +87,7 @@ tests: - role: user content: Research the ThinkPad X1 Carbon Gen 11 laptop and provide a recommendation. - assert: + assertions: - name: research-depth type: tool-trajectory mode: any_order @@ -112,7 +112,7 @@ tests: - role: user content: Research the ThinkPad X1 Carbon Gen 11 laptop and provide a recommendation. - assert: + assertions: - name: input-validator type: tool-trajectory mode: in_order @@ -141,7 +141,7 @@ tests: - role: user content: Research the ThinkPad X1 Carbon Gen 11 laptop and provide a recommendation. - assert: + assertions: - name: output-validator type: tool-trajectory mode: in_order @@ -247,7 +247,7 @@ tests: - "No discrete GPU option" confidence: "High" - assert: + assertions: # Validate the research workflow sequence - name: workflow-validator type: tool-trajectory diff --git a/examples/features/tool-trajectory-simple/evals/dataset.eval.yaml b/examples/features/tool-trajectory-simple/evals/dataset.eval.yaml index afcdbc25e..5d12a426b 100644 --- a/examples/features/tool-trajectory-simple/evals/dataset.eval.yaml +++ b/examples/features/tool-trajectory-simple/evals/dataset.eval.yaml @@ -53,7 +53,7 @@ tests: - role: user content: Research the key differences between REST and GraphQL APIs. - assert: + assertions: - name: tool-usage-check type: tool-trajectory mode: any_order @@ -76,7 +76,7 @@ tests: - role: user content: Process the customer data from the API endpoint. - assert: + assertions: - name: workflow-sequence type: tool-trajectory mode: in_order @@ -101,7 +101,7 @@ tests: - role: user content: Authenticate the user with provided credentials. - assert: + assertions: - name: auth-sequence-exact type: tool-trajectory mode: exact @@ -125,7 +125,7 @@ tests: - role: user content: What are the current system metrics for CPU and memory? - assert: + assertions: - name: metrics-tools type: tool-trajectory mode: any_order @@ -148,7 +148,7 @@ tests: - role: user content: Search for information and generate a report. - assert: + assertions: - name: tool-check type: tool-trajectory mode: any_order @@ -176,7 +176,7 @@ tests: - role: user content: What's the weather like in Paris? - assert: + assertions: - name: arg-validation type: tool-trajectory mode: in_order @@ -203,7 +203,7 @@ tests: - role: user content: Load customer data, normalize it, and save - assert: + assertions: - name: workflow-sequence-only type: tool-trajectory mode: in_order @@ -238,7 +238,7 @@ tests: - role: user content: Research and analyze the data. - assert: + assertions: - name: required-tools type: tool-trajectory mode: superset @@ -261,7 +261,7 @@ tests: - role: user content: Look up the current system status. - assert: + assertions: - name: safe-tools-only type: tool-trajectory mode: subset @@ -284,7 +284,7 @@ tests: - role: user content: Process the customer data from the API endpoint. - assert: + assertions: - name: mixed-precision type: tool-trajectory mode: in_order @@ -309,7 +309,7 @@ tests: - role: user content: Process the customer data from the API endpoint. - assert: + assertions: - name: check-source-only type: tool-trajectory mode: in_order diff --git a/examples/features/trace-evaluation/evals/dataset.eval.yaml b/examples/features/trace-evaluation/evals/dataset.eval.yaml index e78a38ea9..4910f9fe7 100644 --- a/examples/features/trace-evaluation/evals/dataset.eval.yaml +++ b/examples/features/trace-evaluation/evals/dataset.eval.yaml @@ -21,7 +21,7 @@ tests: - role: user content: Summarize the key points of this document. - assert: + assertions: - name: span-count type: code-judge command: ["bun", "run", "../judges/span-count.ts"] @@ -40,7 +40,7 @@ tests: - role: user content: Look up the weather forecast for today. - assert: + assertions: - name: error-check type: code-judge command: ["bun", "run", "../judges/error-spans.ts"] @@ -59,7 +59,7 @@ tests: - role: user content: Answer the question using only approved data sources. - assert: + assertions: - name: error-and-tool-check type: code-judge command: ["bun", "run", "../judges/error-spans.ts"] @@ -81,7 +81,7 @@ tests: - role: user content: Retrieve and format the latest sales data. - assert: + assertions: - name: duration-check type: code-judge command: ["bun", "run", "../judges/span-duration.ts"] @@ -101,7 +101,7 @@ tests: - role: user content: Process this data and generate a summary report. - assert: + assertions: - name: span-count type: code-judge command: ["bun", "run", "../judges/span-count.ts"] diff --git a/examples/features/trial-output-consistency/README.md b/examples/features/trial-output-consistency/README.md index 1df9b4e1d..2a31eba01 100644 --- a/examples/features/trial-output-consistency/README.md +++ b/examples/features/trial-output-consistency/README.md @@ -40,7 +40,7 @@ When an agent is run multiple times on the same input (trials), outputs may vary ### Eval YAML ```yaml -assert: +assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] @@ -95,7 +95,7 @@ const config = { trialOutputs: outputs }; Wrap the judge in an assertion that enforces a minimum consistency threshold: ```yaml -assert: +assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] diff --git a/examples/features/trial-output-consistency/evals/dataset.eval.yaml b/examples/features/trial-output-consistency/evals/dataset.eval.yaml index fbb566077..b4b0dfa61 100644 --- a/examples/features/trial-output-consistency/evals/dataset.eval.yaml +++ b/examples/features/trial-output-consistency/evals/dataset.eval.yaml @@ -23,7 +23,7 @@ tests: - role: assistant content: The capital of France is Paris. - assert: + assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] @@ -46,7 +46,7 @@ tests: - role: assistant content: Any creative tagline. - assert: + assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] @@ -69,7 +69,7 @@ tests: - role: assistant content: "4" - assert: + assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] @@ -90,7 +90,7 @@ tests: - role: assistant content: Hi - assert: + assertions: - name: trial-consistency type: code-judge command: ["bun", "run", "../judges/trial-consistency.ts"] diff --git a/examples/features/weighted-evaluators/evals/dataset.eval.yaml b/examples/features/weighted-evaluators/evals/dataset.eval.yaml index 69a397ec7..b5feac834 100644 --- a/examples/features/weighted-evaluators/evals/dataset.eval.yaml +++ b/examples/features/weighted-evaluators/evals/dataset.eval.yaml @@ -17,7 +17,7 @@ tests: Neural networks are computational models inspired by biological neurons. They consist of interconnected layers of nodes (neurons) that process information through weighted connections. These networks learn patterns from data by adjusting connection weights during training. criteria: | The response should be accurate, safe, and well-structured. - assert: + assertions: # Safety is most important - weight 3.0 - name: safety-check type: llm-judge @@ -45,7 +45,7 @@ tests: Reinforcement learning is a type of machine learning where an agent learns to make decisions by interacting with an environment. The agent receives rewards or penalties for its actions and learns to maximize cumulative rewards over time. criteria: | The response should be accurate and complete. - assert: + assertions: - name: accuracy type: llm-judge prompt: ../prompts/accuracy-check.md @@ -68,7 +68,7 @@ tests: Deep learning is a subset of machine learning that uses neural networks with multiple layers (deep neural networks). Each layer learns to extract increasingly abstract features from the input data, enabling the model to learn complex patterns and representations. criteria: | The response should be comprehensive and accurate. - assert: + assertions: # Omitting weight defaults to 1.0 - name: correctness type: llm-judge diff --git a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml index 6e85a8177..facb1af6d 100644 --- a/examples/features/workspace-multi-repo/evals/dataset.eval.yaml +++ b/examples/features/workspace-multi-repo/evals/dataset.eval.yaml @@ -40,6 +40,6 @@ tests: input: >- Read agentv/package.json and allagents/package.json. Report the name field from each. - assert: + assertions: - type: contains value: agentv diff --git a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml index 4e6872983..52de5906b 100644 --- a/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset-vscode.eval.yaml @@ -40,6 +40,6 @@ tests: input: >- List the files in the my-repo/ directory and read my-repo/package.json. Report the package name. - assert: + assertions: - type: contains value: "@agentv/workspace" diff --git a/examples/features/workspace-setup-script/evals/dataset.eval.yaml b/examples/features/workspace-setup-script/evals/dataset.eval.yaml index 406b6372a..b37c64d2b 100644 --- a/examples/features/workspace-setup-script/evals/dataset.eval.yaml +++ b/examples/features/workspace-setup-script/evals/dataset.eval.yaml @@ -38,7 +38,7 @@ tests: input: >- List the files in the my-repo/ directory and read my-repo/package.json. Report the package name. - assert: + assertions: - type: contains value: "@agentv/workspace" diff --git a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml index 37e91b540..ab71b766a 100644 --- a/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/accuracy/dataset.eval.yaml @@ -11,6 +11,6 @@ tests: and report the package name from package.json. input: >- Read agentv/package.json and report the name field. - assert: + assertions: - type: contains value: agentv diff --git a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml index 0162cb0d8..9aced7cbd 100644 --- a/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml +++ b/examples/features/workspace-shared-config/evals/regression/dataset.eval.yaml @@ -11,6 +11,6 @@ tests: and report its contents. input: >- Read the README.md file in the workspace root and report its contents. - assert: + assertions: - type: contains value: Shared Workspace Template diff --git a/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml b/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml index 071dd637d..c899332cd 100644 --- a/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml +++ b/examples/showcase/cross-repo-sync/evals/dataset.eval.yaml @@ -32,7 +32,7 @@ tests: agentv just merged eval spec v2 (PR #262). Update the agentevals spec docs to reflect: 4 new deterministic assert types, required gates, assert field at test/suite level, tests-as-string-path. - assert: + assertions: - name: sync-check type: code-judge command: ["bash", "../scripts/run-ts.sh", "../scripts/validate-sync.ts"] @@ -52,7 +52,7 @@ tests: content: | agentv renamed cases→tests in the eval schema (PR #240). Update all agentevals spec docs to match. - assert: + assertions: - name: sync-check type: code-judge command: ["bash", "../scripts/run-ts.sh", "../scripts/validate-sync.ts"] @@ -73,7 +73,7 @@ tests: agentv renamed schema fields: eval_cases→cases, expected_outcome→criteria at case level, expected_outcome→outcome at rubric level (PR #202). Update agentevals spec docs accordingly. - assert: + assertions: - name: sync-check type: code-judge command: ["bash", "../scripts/run-ts.sh", "../scripts/validate-sync.ts"] diff --git a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml index 593c89d79..c1ec6a0e3 100644 --- a/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml +++ b/examples/showcase/cw-incident-triage/evals/dataset.eval.yaml @@ -9,7 +9,7 @@ description: CargoWise criticality rating (CR1-CR9) classification eval for supp execution: target: default -assert: +assertions: - name: json_schema_validator type: code-judge command: ["uv", "run", "validate_output.py"] diff --git a/examples/showcase/evaluator-conformance/EVAL.yaml b/examples/showcase/evaluator-conformance/EVAL.yaml index 42d8264b0..3a6570b4d 100644 --- a/examples/showcase/evaluator-conformance/EVAL.yaml +++ b/examples/showcase/evaluator-conformance/EVAL.yaml @@ -14,7 +14,7 @@ tests: criteria: "Answer must name the capital city of France." input: "What is the capital of France?" expected_output: "Paris" - assert: + assertions: - name: keyword-judge type: code-judge command: ["bun", "run", "evaluators/keyword-judge.ts"] @@ -23,7 +23,7 @@ tests: criteria: "Answer must mention red, blue, and yellow." input: "Name the primary colors." expected_output: "red, blue, yellow" - assert: + assertions: - name: keyword-judge type: code-judge command: ["bun", "run", "evaluators/keyword-judge.ts"] diff --git a/examples/showcase/export-screening/evals/dataset.eval.yaml b/examples/showcase/export-screening/evals/dataset.eval.yaml index 856c53356..a05468586 100644 --- a/examples/showcase/export-screening/evals/dataset.eval.yaml +++ b/examples/showcase/export-screening/evals/dataset.eval.yaml @@ -20,7 +20,7 @@ description: Export control risk classification eval for trade compliance screen execution: target: default -assert: +assertions: - name: risk_assessment_quality type: code-judge command: ["bun", "run", "validate_risk_output.ts"] diff --git a/examples/showcase/multi-model-benchmark/README.md b/examples/showcase/multi-model-benchmark/README.md index 94ba5d283..238ad4a48 100644 --- a/examples/showcase/multi-model-benchmark/README.md +++ b/examples/showcase/multi-model-benchmark/README.md @@ -107,7 +107,7 @@ execution: Three LLM judges score each response. Weights control their contribution to the aggregate score: ```yaml -assert: +assertions: - name: accuracy weight: 3.0 # Most important — factual correctness - name: completeness @@ -181,10 +181,10 @@ execution: ### Adding an evaluator -Add a new judge prompt in `prompts/` and reference it in the eval's `assert` block: +Add a new judge prompt in `prompts/` and reference it in the eval's `assertions` block: ```yaml -assert: +assertions: - name: safety type: llm-judge prompt: ../prompts/safety-rubric.md diff --git a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml index d14654411..d82accfce 100644 --- a/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml +++ b/examples/showcase/multi-model-benchmark/evals/benchmark.eval.yaml @@ -23,7 +23,7 @@ execution: strategy: pass_at_k cost_limit_usd: 2.00 -assert: +assertions: - name: accuracy type: llm-judge prompt: ../prompts/accuracy-rubric.md diff --git a/examples/showcase/offline-judge-benchmark/evals/setup-a.eval.yaml b/examples/showcase/offline-judge-benchmark/evals/setup-a.eval.yaml index f4e5c3190..bb0752a17 100644 --- a/examples/showcase/offline-judge-benchmark/evals/setup-a.eval.yaml +++ b/examples/showcase/offline-judge-benchmark/evals/setup-a.eval.yaml @@ -5,13 +5,13 @@ execution: tests: - file://../fixtures/labeled-judge-export.jsonl -assert: +assertions: - name: judge-panel type: composite aggregator: type: threshold threshold: 0.6 - assert: + assertions: - name: judge-gpt-5-mini type: llm-judge target: judge_gpt_5_mini diff --git a/examples/showcase/offline-judge-benchmark/evals/setup-b.eval.yaml b/examples/showcase/offline-judge-benchmark/evals/setup-b.eval.yaml index e2e1e6cf5..7e442eea3 100644 --- a/examples/showcase/offline-judge-benchmark/evals/setup-b.eval.yaml +++ b/examples/showcase/offline-judge-benchmark/evals/setup-b.eval.yaml @@ -5,13 +5,13 @@ execution: tests: - file://../fixtures/labeled-judge-export.jsonl -assert: +assertions: - name: judge-panel type: composite aggregator: type: threshold threshold: 0.6 - assert: + assertions: - name: judge-gpt-5-mini type: llm-judge target: judge_gpt_5_mini diff --git a/examples/showcase/psychotherapy/evals/encouragement.eval.yaml b/examples/showcase/psychotherapy/evals/encouragement.eval.yaml index 6ed692b73..20df594d4 100644 --- a/examples/showcase/psychotherapy/evals/encouragement.eval.yaml +++ b/examples/showcase/psychotherapy/evals/encouragement.eval.yaml @@ -6,7 +6,7 @@ description: |- execution: target: gemini_base -assert: +assertions: - name: json_schema_validator type: code-judge command: ["uv", "run", "validate_output.py"] diff --git a/examples/showcase/psychotherapy/evals/listening.eval.yaml b/examples/showcase/psychotherapy/evals/listening.eval.yaml index a53e18b70..645d1e80a 100644 --- a/examples/showcase/psychotherapy/evals/listening.eval.yaml +++ b/examples/showcase/psychotherapy/evals/listening.eval.yaml @@ -5,7 +5,7 @@ description: |- execution: target: gemini_base -assert: +assertions: - name: json_schema_validator type: code-judge command: ["uv", "run", "validate_output.py"] diff --git a/examples/showcase/psychotherapy/evals/routing.eval.yaml b/examples/showcase/psychotherapy/evals/routing.eval.yaml index 3982f36ee..109539a52 100644 --- a/examples/showcase/psychotherapy/evals/routing.eval.yaml +++ b/examples/showcase/psychotherapy/evals/routing.eval.yaml @@ -4,7 +4,7 @@ description: |- execution: target: gemini_base -assert: +assertions: - name: json_schema_validator type: code-judge command: ["uv", "run", "validate_output.py"] diff --git a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml index 169dbf359..d0dde7214 100644 --- a/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml +++ b/examples/showcase/tool-evaluation-plugins/tool-eval-demo.yaml @@ -28,7 +28,7 @@ tests: - role: user content: Find information about the current weather in Tokyo and fetch the detailed forecast. - assert: + assertions: # Built-in: Check minimum tool calls - name: trajectory-check type: tool-trajectory @@ -57,7 +57,7 @@ tests: - role: user content: Get the current time. - assert: + assertions: # Plugin: Efficiency metrics scoring - name: efficiency-check type: code-judge @@ -81,7 +81,7 @@ tests: - role: user content: Analyze the quarterly sales data and generate a summary report. - assert: + assertions: # Built-in: Verify required workflow sequence - name: workflow-trajectory type: tool-trajectory @@ -125,7 +125,7 @@ tests: 2. Configuration: Edit settings.json 3. Usage: Run the main command - assert: + assertions: # Plugin: Pairwise comparison with position bias mitigation - name: pairwise-quality type: code-judge diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index d7141c577..4e540f772 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -41,14 +41,14 @@ export async function parseEvaluators( const caseEvaluators = rawEvalCase.assertions ?? rawEvalCase.assert ?? - (executionObject ? executionObject.evaluators : undefined) ?? // deprecated: use assert - rawEvalCase.evaluators; // deprecated: use assert + (executionObject ? executionObject.evaluators : undefined) ?? // deprecated: use assertions + rawEvalCase.evaluators; // deprecated: use assertions // Root-level (default) evaluators: assertions > assert > execution.evaluators (deprecated) const skipDefaults = executionObject?.skip_defaults === true; const rootEvaluators = skipDefaults ? undefined - : (globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators); // deprecated: use assert + : (globalExecution?.assertions ?? globalExecution?.assert ?? globalExecution?.evaluators); // deprecated: use assertions // Parse case-level evaluators const parsedCase = await parseEvaluatorList(caseEvaluators, searchRoots, evalId); @@ -294,7 +294,7 @@ async function parseEvaluatorList( const rawMembers = rawEvaluator.assertions ?? rawEvaluator.assert ?? rawEvaluator.evaluators; // evaluators deprecated if (!Array.isArray(rawMembers)) { logWarning( - `Skipping composite evaluator '${name}' in '${evalId}': missing evaluators (or assert) array`, + `Skipping composite evaluator '${name}' in '${evalId}': missing assertions (or evaluators) array`, ); continue; } diff --git a/packages/core/src/evaluation/loaders/jsonl-parser.ts b/packages/core/src/evaluation/loaders/jsonl-parser.ts index 805bb0493..73dec606b 100644 --- a/packages/core/src/evaluation/loaders/jsonl-parser.ts +++ b/packages/core/src/evaluation/loaders/jsonl-parser.ts @@ -279,7 +279,7 @@ export async function loadTestsFromJsonl( continue; } - // Handle inline rubrics field (deprecated: use assert: [{type: rubrics, criteria: [...]}] instead) + // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) const inlineRubrics = evalcase.rubrics; if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { const rubricEvaluator = parseInlineRubrics(inlineRubrics); diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 5f25174bc..690373b43 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -386,6 +386,8 @@ export const EvalFileSchema = z.object({ // Execution execution: ExecutionSchema.optional(), // Suite-level assertions + assertions: z.array(EvaluatorSchema).optional(), + /** @deprecated Use `assertions` instead */ assert: z.array(EvaluatorSchema).optional(), // Workspace (inline object or path to external workspace YAML file) workspace: z.union([WorkspaceSchema, z.string()]).optional(), diff --git a/packages/core/src/evaluation/validation/eval-validator.ts b/packages/core/src/evaluation/validation/eval-validator.ts index da85821d5..c9593d5a5 100644 --- a/packages/core/src/evaluation/validation/eval-validator.ts +++ b/packages/core/src/evaluation/validation/eval-validator.ts @@ -244,8 +244,8 @@ export async function validateEvalFile(filePath: string): Promise 0 || evalcase.assert !== undefined; + !!outcome || + expectedMessages.length > 0 || + evalcase.assertions !== undefined || + evalcase.assert !== undefined; if (!id || !hasEvaluationSpec || !testInputMessages || testInputMessages.length === 0) { logError( - `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assert`, + `Skipping incomplete test: ${id ?? 'unknown'}. Missing required fields: id, input, and at least one of criteria/expected_output/assertions`, ); continue; } @@ -421,7 +433,7 @@ async function loadTestsFromYaml( continue; } - // Handle inline rubrics field (deprecated: use assert: [{type: rubrics, criteria: [...]}] instead) + // Handle inline rubrics field (deprecated: use assertions: [{type: rubrics, criteria: [...]}] instead) const inlineRubrics = evalcase.rubrics; if (inlineRubrics !== undefined && Array.isArray(inlineRubrics)) { const rubricEvaluator = parseInlineRubrics(inlineRubrics); diff --git a/packages/core/test/evaluation/criteria-optional.test.ts b/packages/core/test/evaluation/criteria-optional.test.ts index e216b5b52..09ef60abd 100644 --- a/packages/core/test/evaluation/criteria-optional.test.ts +++ b/packages/core/test/evaluation/criteria-optional.test.ts @@ -24,7 +24,7 @@ describe('criteria is optional when expected_output or assert is present', () => - id: test-01 input: "sample prompt" expected_output: "sample expected output" - assert: + assertions: - type: contains value: sample `, @@ -36,13 +36,13 @@ describe('criteria is optional when expected_output or assert is present', () => expect(tests[0].criteria).toBe(''); }); - it('accepts test with assert only and no criteria', async () => { + it('accepts test with assertions only and no criteria', async () => { await writeFile( path.join(tempDir, 'assert-only.eval.yaml'), `tests: - id: test-02 input: "sample prompt" - assert: + assertions: - type: rubrics criteria: - response includes sample expected output @@ -60,7 +60,7 @@ describe('criteria is optional when expected_output or assert is present', () => path.join(tempDir, 'missing-input.eval.yaml'), `tests: - id: test-03 - assert: + assertions: - type: contains value: sample `, @@ -70,7 +70,7 @@ describe('criteria is optional when expected_output or assert is present', () => expect(tests).toHaveLength(0); }); - it('skips test with no criteria, no expected_output, and no assert', async () => { + it('skips test with no criteria, no expected_output, and no assertions', async () => { await writeFile( path.join(tempDir, 'no-eval-spec.eval.yaml'), `tests: diff --git a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts index 2e221870c..ff631d791 100644 --- a/packages/core/test/evaluation/loaders/evaluator-parser.test.ts +++ b/packages/core/test/evaluation/loaders/evaluator-parser.test.ts @@ -1232,10 +1232,10 @@ describe('parseEvaluators - assert field', () => { await rm(tempDir, { recursive: true, force: true }); }); - it('parses assert field as evaluators', async () => { + it('parses assertions field as evaluators', async () => { const evaluators = await parseEvaluators( { - assert: [{ type: 'contains', value: 'DENIED' }], + assertions: [{ type: 'contains', value: 'DENIED' }], }, undefined, [tempDir], @@ -1245,10 +1245,23 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[0].type).toBe('contains'); }); - it('assert takes precedence over execution.evaluators', async () => { + it('parses legacy assert field as evaluators (backward compat)', async () => { const evaluators = await parseEvaluators( { assert: [{ type: 'contains', value: 'DENIED' }], + }, + undefined, + [tempDir], + 'test-1', + ); + expect(evaluators).toHaveLength(1); + expect(evaluators?.[0].type).toBe('contains'); + }); + + it('assertions takes precedence over execution.evaluators', async () => { + const evaluators = await parseEvaluators( + { + assertions: [{ type: 'contains', value: 'DENIED' }], execution: { evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }], }, @@ -1261,10 +1274,10 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[0].type).toBe('contains'); }); - it('assert takes precedence over top-level evaluators', async () => { + it('assertions takes precedence over top-level evaluators', async () => { const evaluators = await parseEvaluators( { - assert: [{ type: 'contains', value: 'DENIED' }], + assertions: [{ type: 'contains', value: 'DENIED' }], evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }], }, undefined, @@ -1275,12 +1288,12 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[0].type).toBe('contains'); }); - it('merges suite-level assert with test-level assert', async () => { + it('merges suite-level assertions with test-level assertions', async () => { const evaluators = await parseEvaluators( { - assert: [{ type: 'contains', value: 'DENIED' }], + assertions: [{ type: 'contains', value: 'DENIED' }], }, - { assert: [{ name: 'latency-check', type: 'latency', threshold: 5000 }] }, + { assertions: [{ name: 'latency-check', type: 'latency', threshold: 5000 }] }, [tempDir], 'test-1', ); @@ -1289,13 +1302,13 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[1].type).toBe('latency'); }); - it('skip_defaults prevents suite-level assert from being appended', async () => { + it('skip_defaults prevents suite-level assertions from being appended', async () => { const evaluators = await parseEvaluators( { - assert: [{ type: 'contains', value: 'DENIED' }], + assertions: [{ type: 'contains', value: 'DENIED' }], execution: { skip_defaults: true }, }, - { assert: [{ name: 'latency-check', type: 'latency', threshold: 5000 }] }, + { assertions: [{ name: 'latency-check', type: 'latency', threshold: 5000 }] }, [tempDir], 'test-1', ); @@ -1318,11 +1331,11 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[0].type).toBe('latency'); }); - it('suite-level assert takes precedence over suite-level execution.evaluators', async () => { + it('suite-level assertions takes precedence over suite-level execution.evaluators', async () => { const evaluators = await parseEvaluators( {}, { - assert: [{ type: 'contains', value: 'HELLO' }], + assertions: [{ type: 'contains', value: 'HELLO' }], evaluators: [{ name: 'latency-check', type: 'latency', threshold: 5000 }], }, [tempDir], @@ -1332,7 +1345,7 @@ describe('parseEvaluators - assert field', () => { expect(evaluators?.[0].type).toBe('contains'); }); - it('falls back to suite-level execution.evaluators when suite assert is not present', async () => { + it('falls back to suite-level execution.evaluators when suite assertions is not present', async () => { const evaluators = await parseEvaluators( {}, { @@ -1361,7 +1374,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => { it('parses rubrics type with criteria array', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { type: 'rubrics', criteria: [ @@ -1385,7 +1398,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => { it('auto-generates name for rubrics type', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { type: 'rubrics', criteria: [{ id: 'check-1', outcome: 'Some check', weight: 1.0 }], @@ -1403,7 +1416,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => { it('skips rubrics with empty criteria array', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { type: 'rubrics', criteria: [], @@ -1420,7 +1433,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => { it('skips rubrics with missing criteria', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { type: 'rubrics', }, @@ -1436,7 +1449,7 @@ describe('parseEvaluators - type: rubrics with criteria', () => { it('supports string shorthand in criteria', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { type: 'rubrics', criteria: ['Must be polite', 'Must be accurate'], @@ -1578,7 +1591,7 @@ describe('parseEvaluators - required field', () => { }); }); -describe('parseEvaluators - composite assert field', () => { +describe('parseEvaluators - composite assertions field', () => { let tempDir: string; beforeAll(async () => { @@ -1593,14 +1606,14 @@ describe('parseEvaluators - composite assert field', () => { await rm(tempDir, { recursive: true, force: true }); }); - it('parses composite with assert field (new syntax)', async () => { + it('parses composite with assertions field', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { name: 'combined', type: 'composite', - assert: [ + assertions: [ { name: 'safety', type: 'llm-judge', prompt: './safety.md' }, { name: 'quality', type: 'llm-judge', prompt: './quality.md' }, ], @@ -1639,14 +1652,14 @@ describe('parseEvaluators - composite assert field', () => { expect(evaluators?.[0].type).toBe('composite'); }); - it('composite assert takes precedence over evaluators', async () => { + it('composite assertions takes precedence over evaluators', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ { name: 'combined', type: 'composite', - assert: [{ name: 'safety', type: 'llm-judge', prompt: './safety.md' }], + assertions: [{ name: 'safety', type: 'llm-judge', prompt: './safety.md' }], evaluators: [{ name: 'quality', type: 'llm-judge', prompt: './quality.md' }], aggregator: { type: 'weighted_average' }, }, @@ -1657,18 +1670,18 @@ describe('parseEvaluators - composite assert field', () => { 'test-1', ); expect(evaluators).toHaveLength(1); - // assert takes precedence - only 1 inner evaluator + // assertions takes precedence - only 1 inner evaluator const composite = evaluators?.[0] as CompositeEvaluatorConfig; expect(composite.assertions).toHaveLength(1); expect(composite.assertions[0].name).toBe('safety'); }); }); -describe('parseEvaluators - string shorthand in assert', () => { - it('treats all-string assert array as a single rubrics evaluator', async () => { +describe('parseEvaluators - string shorthand in assertions', () => { + it('treats all-string assertions array as a single rubrics evaluator', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ 'Mentions divide-and-conquer approach', 'Explains partition step', 'States time complexity', @@ -1697,7 +1710,7 @@ describe('parseEvaluators - string shorthand in assert', () => { it('groups strings into rubrics and preserves object evaluators', async () => { const evaluators = await parseEvaluators( { - assert: [ + assertions: [ 'Mentions divide-and-conquer approach', { name: 'syntax-check', type: 'contains', value: 'quicksort' }, 'States time complexity', @@ -1723,7 +1736,7 @@ describe('parseEvaluators - string shorthand in assert', () => { it('treats a single string as a single-criterion rubrics evaluator', async () => { const evaluators = await parseEvaluators( { - assert: ['Response must be polite'], + assertions: ['Response must be polite'], }, undefined, ['/tmp'], @@ -1741,7 +1754,7 @@ describe('parseEvaluators - string shorthand in assert', () => { it('ignores all-whitespace strings and produces no rubrics evaluator', async () => { const evaluators = await parseEvaluators( { - assert: [' ', ''], + assertions: [' ', ''], }, undefined, ['/tmp'], diff --git a/packages/core/test/evaluation/validation/eval-validator.test.ts b/packages/core/test/evaluation/validation/eval-validator.test.ts index 85d211165..dcf9de2ad 100644 --- a/packages/core/test/evaluation/validation/eval-validator.test.ts +++ b/packages/core/test/evaluation/validation/eval-validator.test.ts @@ -153,7 +153,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - value: test `, ); @@ -171,7 +171,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: invalid_evaluator value: test `, @@ -190,7 +190,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains `, ); @@ -208,7 +208,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: equals `, ); @@ -226,7 +226,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: regex value: "[invalid" `, @@ -245,7 +245,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: regex `, ); @@ -263,7 +263,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "Return JSON" - assert: + assertions: - type: is_json `, ); @@ -282,7 +282,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" required: true @@ -303,7 +303,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" required: 0.8 @@ -324,7 +324,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" required: "yes" @@ -344,7 +344,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" required: 0 @@ -364,7 +364,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" required: 1.5 @@ -384,14 +384,14 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: "contains" + assertions: "contains" `, ); const result = await validateEvalFile(filePath); const warnings = result.errors.filter((e) => e.severity === 'warning'); - expect(warnings.some((e) => e.message.includes('assert'))).toBe(true); + expect(warnings.some((e) => e.message.includes('assertions'))).toBe(true); }); it('warns when assert item is not an object', async () => { @@ -401,7 +401,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "What is 2+2?" - assert: + assertions: - "contains" `, ); @@ -419,7 +419,7 @@ describe('validateEvalFile', () => { `tests: - id: test-1 input: "Is this entity sanctioned?" - assert: + assertions: - type: contains value: DENIED - type: is_json diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index c44944fc9..2665b1493 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -24,9 +24,9 @@ export type AssertionContext = CodeJudgeInput; /** * Known built-in assertion types. Custom types are extensible via string. * - * Use in EVAL.yaml `assert` blocks: + * Use in EVAL.yaml `assertions` blocks: * ```yaml - * assert: + * assertions: * - type: contains * value: "Paris" * ``` diff --git a/plugins/agentv-dev/skills/agentv-bench/SKILL.md b/plugins/agentv-dev/skills/agentv-bench/SKILL.md index 6cd1dc47e..bf8dac9b2 100644 --- a/plugins/agentv-dev/skills/agentv-bench/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-bench/SKILL.md @@ -113,7 +113,7 @@ tests: - id: basic-code-review input: "Review this TypeScript file for bugs and suggest improvements" criteria: "Identifies the null pointer bug on line 12 and suggests a fix" - assert: + assertions: - type: contains value: "null" - type: llm-judge @@ -418,12 +418,12 @@ Write as EVAL.yaml with top-level input (the user prompt doesn't specify the ski tests: - id: should-trigger-casual-optimize input: "ok so I have this agent that keeps failing on the code review tasks, can you help me figure out why and fix it" - assert: + assertions: - type: contains value: "agentv-bench" - id: should-not-trigger-build-error input: "my TypeScript build is failing with type errors in src/auth.ts" - assert: + assertions: - type: not-contains value: "agentv-bench" ``` diff --git a/plugins/agentv-dev/skills/agentv-chat-to-eval/README.md b/plugins/agentv-dev/skills/agentv-chat-to-eval/README.md index b661e5f84..e14135ca9 100644 --- a/plugins/agentv-dev/skills/agentv-chat-to-eval/README.md +++ b/plugins/agentv-dev/skills/agentv-chat-to-eval/README.md @@ -44,7 +44,7 @@ tests: criteria: "Correctly identify the capital of France" input: "What's the capital of France?" expected_output: "The capital of France is Paris." - assert: + assertions: - type: rubrics criteria: - States Paris as the capital @@ -54,7 +54,7 @@ tests: criteria: "Explain how to reverse a list in Python" input: "How do I reverse a list in Python?" expected_output: "Use the `reverse()` method or slicing: `my_list[::-1]`" - assert: + assertions: - type: rubrics criteria: - Provides at least one valid method to reverse a list @@ -62,7 +62,7 @@ tests: - Explanation is clear and actionable # Suggested additional evaluators: -# assert: +# assertions: # - name: quality # type: llm-judge # prompt: ./prompts/quality.md diff --git a/plugins/agentv-dev/skills/agentv-chat-to-eval/SKILL.md b/plugins/agentv-dev/skills/agentv-chat-to-eval/SKILL.md index 53e151e36..e3c9d4f8e 100644 --- a/plugins/agentv-dev/skills/agentv-chat-to-eval/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-chat-to-eval/SKILL.md @@ -73,7 +73,7 @@ tests: criteria: "" input: "" expected_output: "" - assert: + assertions: - type: rubrics criteria: - @@ -86,7 +86,7 @@ tests: - Write `criteria` as a concise statement of what a good response achieves - Use `input` for single user messages; use `input` for multi-turn - Set `expected_output` to the actual assistant response from the transcript -- Include 2–4 rubrics per test as `type: rubrics` under `assert` capturing distinct quality dimensions +- Include 2–4 rubrics per test as `type: rubrics` under `assertions` capturing distinct quality dimensions ### 5. Suggest Evaluators @@ -94,7 +94,7 @@ Append a commented evaluator configuration based on the test content: ```yaml # Suggested additional evaluators: -# assert: +# assertions: # - name: quality # type: llm-judge # prompt: ./prompts/quality.md @@ -128,7 +128,7 @@ tests: - role: user content: "What's my name?" expected_output: "Your name is Alice." - assert: + assertions: - type: rubrics criteria: - Correctly recalls the user's name from earlier in the conversation diff --git a/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-json.md b/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-json.md index 39fc34d90..5db95e207 100644 --- a/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-json.md +++ b/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-json.md @@ -23,7 +23,7 @@ tests: criteria: "Convert Fahrenheit to Celsius with correct formula" input: "Convert 72°F to Celsius" expected_output: "72°F is approximately 22.2°C. The formula is: (°F - 32) × 5/9 = °C" - assert: + assertions: - type: rubrics criteria: - Provides the correct converted value (approximately 22.2°C) @@ -40,7 +40,7 @@ tests: - role: user content: "What about -40?" expected_output: "-40°F equals exactly -40°C. This is the unique point where Fahrenheit and Celsius scales intersect." - assert: + assertions: - type: rubrics criteria: - States that -40°F equals -40°C @@ -50,14 +50,14 @@ tests: criteria: "Confirm absolute zero equivalence and explain its significance" input: "Is 0 Kelvin the same as -273.15°C?" expected_output: "Yes, 0 Kelvin (absolute zero) equals -273.15°C. It's the theoretical lowest possible temperature where all molecular motion stops." - assert: + assertions: - type: rubrics criteria: - Confirms the equivalence of 0K and -273.15°C - Explains the physical significance of absolute zero # Suggested additional evaluators: -# assert: +# assertions: # - name: accuracy # type: code-judge # command: [./scripts/check_conversion.py] diff --git a/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-markdown.md b/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-markdown.md index 4d6d3d888..035f85874 100644 --- a/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-markdown.md +++ b/plugins/agentv-dev/skills/agentv-chat-to-eval/examples/transcript-markdown.md @@ -46,7 +46,7 @@ tests: 3. **Liskov Substitution** — Subtypes must be substitutable for their base types 4. **Interface Segregation** — Prefer small, specific interfaces over large ones 5. **Dependency Inversion** — Depend on abstractions, not concretions - assert: + assertions: - type: rubrics criteria: - Lists all five SOLID principles by name @@ -69,7 +69,7 @@ tests: Good: `AuthService` handles login/registration. `EmailService` handles sending emails. Each class has one job, making them easier to test and maintain. - assert: + assertions: - type: rubrics criteria: - Shows a bad example that violates SRP @@ -86,7 +86,7 @@ tests: - **Abstract class**: Can have both abstract and concrete methods. A class can extend only one. Can hold state. Use interfaces for "can-do" relationships (e.g., `Serializable`). Use abstract classes for "is-a" relationships with shared behavior (e.g., `Animal` base class). - assert: + assertions: - type: rubrics criteria: - Correctly distinguishes interfaces from abstract classes @@ -94,7 +94,7 @@ tests: - Provides guidance on when to use each # Suggested additional evaluators: -# assert: +# assertions: # - name: quality # type: llm-judge # prompt: ./prompts/quality.md diff --git a/plugins/agentv-dev/skills/agentv-eval-analyzer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-analyzer/SKILL.md index 65354c873..d059dc8d3 100644 --- a/plugins/agentv-dev/skills/agentv-eval-analyzer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-analyzer/SKILL.md @@ -83,7 +83,7 @@ The analyzer report includes concrete YAML snippets for each suggestion. To appl Before (LLM-judge doing substring work): ```yaml -assert: +assertions: - name: has-error-code type: llm-judge prompt: "Check if the response contains the error code 404" @@ -91,7 +91,7 @@ assert: After (deterministic, zero LLM cost): ```yaml -assert: +assertions: - name: has-error-code type: contains value: "404" diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 5eed3df68..0b9f6c00e 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -52,7 +52,7 @@ tests: criteria: Friendly greeting input: "Say hello" expected_output: "Hello! How can I help you?" - assert: + assertions: - type: rubrics criteria: - Greeting is friendly and warm @@ -62,7 +62,7 @@ tests: ## Eval File Structure **Required:** `tests` (array or string path) -**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assert`, `input` +**Optional:** `name`, `description`, `version`, `author`, `tags`, `license`, `requires`, `execution`, `dataset`, `workspace`, `assertions`, `input` **Test fields:** @@ -72,8 +72,8 @@ tests: | `criteria` | yes | What the response should accomplish | | `input` / `input` | yes | Input to the agent | | `expected_output` / `expected_output` | no | Gold-standard reference answer | -| `assert` | no | Evaluators: assertions, rubrics, judges | -| `rubrics` | no | **Deprecated** — use `assert: [{type: rubrics, criteria: [...]}]` instead | +| `assertions` | no | Evaluators: assertions, rubrics, judges | +| `rubrics` | no | **Deprecated** — use `assertions: [{type: rubrics, criteria: [...]}]` instead | | `execution` | no | Per-case execution overrides | | `workspace` | no | Per-case workspace config (overrides suite-level) | | `metadata` | no | Arbitrary key-value pairs passed to setup/teardown scripts | @@ -142,13 +142,13 @@ tests: ./cases.yaml # relative to eval file dir The external file can be YAML (array of test objects) or JSONL. -## Assert Field +## Assertions Field -`assert` defines evaluators at the suite level or per-test level. It is the canonical field for all evaluators (replaces `execution.evaluators`): +`assertions` defines evaluators at the suite level or per-test level. It is the canonical field for all evaluators (replaces `execution.evaluators`): ```yaml # Suite-level (appended to every test) -assert: +assertions: - type: is-json required: true - type: contains @@ -158,25 +158,25 @@ tests: - id: test-1 criteria: Returns JSON input: Get status - # Per-test assert (runs before suite-level) - assert: + # Per-test assertions (runs before suite-level) + assertions: - type: equals value: '{"status": "ok"}' ``` -`execution.evaluators` is deprecated. When both `assert` and `execution.evaluators` are present, `assert` takes precedence. +`execution.evaluators` is deprecated. When both `assertions` and `execution.evaluators` are present, `assertions` takes precedence. -## How `criteria` and `assert` Interact +## How `criteria` and `assertions` Interact -`criteria` is a **data field** — it describes what the response should accomplish. It is **not** an evaluator. How it gets evaluated depends on whether `assert` is present: +`criteria` is a **data field** — it describes what the response should accomplish. It is **not** an evaluator. How it gets evaluated depends on whether `assertions` is present: | Scenario | What happens | Warning? | |----------|-------------|----------| -| `criteria` + **no `assert`** | Implicit `llm-judge` runs automatically against `criteria` | No | -| `criteria` + **`assert` with only deterministic evaluators** (contains, regex, etc.) | Only declared evaluators run. `criteria` is **not evaluated**. | Yes — warns that no evaluator will consume criteria | -| `criteria` + **`assert` with a judge** (llm-judge, code-judge, agent-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | +| `criteria` + **no `assertions`** | Implicit `llm-judge` runs automatically against `criteria` | No | +| `criteria` + **`assertions` with only deterministic evaluators** (contains, regex, etc.) | Only declared evaluators run. `criteria` is **not evaluated**. | Yes — warns that no evaluator will consume criteria | +| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, agent-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | -### No assert → implicit llm-judge +### No assertions → implicit llm-judge The simplest path. `criteria` is automatically evaluated by the default `llm-judge`: @@ -185,20 +185,20 @@ tests: - id: simple-eval criteria: Assistant correctly explains the bug and proposes a fix input: "Debug this function..." - # No assert → default llm-judge evaluates against criteria + # No assertions → default llm-judge evaluates against criteria ``` -### assert present → no implicit judge +### assertions present → no implicit judge -When `assert` is defined, **only the declared evaluators run**. If you want an LLM judge alongside deterministic checks, declare it explicitly: +When `assertions` is defined, **only the declared evaluators run**. If you want an LLM judge alongside deterministic checks, declare it explicitly: ```yaml tests: - id: mixed-eval criteria: Response is helpful and mentions the fix input: "Debug this function..." - assert: - - type: llm-judge # must be explicit when assert is present + assertions: + - type: llm-judge # must be explicit when assertions is present - type: contains value: "fix" ``` @@ -208,12 +208,12 @@ tests: ```yaml tests: - id: bad-example - criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no judge in assert + criteria: Gives a thoughtful answer # ⚠ NOT evaluated — no judge in assertions input: "What is 2+2?" - assert: + assertions: - type: contains value: "4" - # Warning: criteria is defined but no evaluator in assert will evaluate it. + # Warning: criteria is defined but no evaluator in assertions will evaluate it. ``` ## Required Gates @@ -221,7 +221,7 @@ tests: Any evaluator can be marked `required` to enforce a minimum score: ```yaml -assert: +assertions: - type: contains value: "DENIED" required: true # must score >= 0.8 (default) @@ -305,7 +305,7 @@ See https://agentv.dev/targets/configuration/#repository-lifecycle ## Evaluator Types -Configure via `assert` array. Multiple evaluators produce a weighted average score. +Configure via `assertions` array. Multiple evaluators produce a weighted average score. ### code_judge ```yaml @@ -339,7 +339,7 @@ Variables: `{{question}}`, `{{criteria}}`, `{{answer}}`, `{{reference_answer}}`, ```yaml - name: gate type: composite - assert: + assertions: - name: safety type: llm-judge prompt: ./safety.md diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 6b5dd0f11..57c56188e 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,7 +53,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -67,20 +72,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -115,7 +129,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -129,20 +148,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -164,7 +192,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -178,20 +211,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -228,7 +270,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -280,7 +325,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -310,7 +358,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -404,7 +455,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -423,7 +477,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -483,7 +539,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -499,7 +557,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -516,7 +577,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -533,13 +597,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -569,11 +638,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -614,7 +692,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -628,7 +711,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -639,7 +727,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -647,7 +737,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -661,7 +756,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -672,7 +772,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -702,7 +805,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -714,7 +820,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -736,17 +846,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -783,7 +902,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -820,7 +942,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -850,7 +975,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -865,7 +993,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -895,7 +1025,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -927,7 +1060,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -957,7 +1092,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -1011,7 +1149,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1033,7 +1174,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1069,7 +1212,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1105,7 +1251,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1135,10 +1284,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1174,7 +1328,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1255,7 +1412,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1265,7 +1425,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -1302,7 +1465,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -1354,7 +1520,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -1384,7 +1553,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -1478,7 +1650,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1497,7 +1672,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1557,7 +1734,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1573,7 +1752,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1590,7 +1772,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -1607,13 +1792,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -1643,11 +1833,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -1688,7 +1887,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1702,7 +1906,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1713,7 +1922,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -1721,7 +1932,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1735,7 +1951,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1746,7 +1967,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -1776,7 +2000,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -1788,7 +2015,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -1810,17 +2041,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -1857,7 +2097,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1894,7 +2137,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -1924,7 +2170,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -1939,7 +2188,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1969,7 +2220,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -2001,7 +2255,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2031,7 +2287,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -2085,7 +2344,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2107,7 +2369,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2143,7 +2407,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2179,7 +2446,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2209,10 +2479,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2248,7 +2523,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2329,7 +2607,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2339,7 +2620,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -2376,7 +2660,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -2428,7 +2715,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -2458,7 +2748,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -2552,7 +2845,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2571,7 +2867,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2631,7 +2929,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2647,7 +2947,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2664,7 +2967,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -2681,13 +2987,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -2717,11 +3028,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -2762,7 +3082,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2776,7 +3101,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2787,7 +3117,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -2795,7 +3127,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2809,7 +3146,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2820,7 +3162,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -2850,7 +3195,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -2862,7 +3210,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -2884,17 +3236,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -2931,7 +3292,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2968,7 +3332,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -2998,7 +3365,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -3013,7 +3383,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3043,7 +3415,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -3075,7 +3450,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3105,7 +3482,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -3159,7 +3539,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3181,7 +3564,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3217,7 +3602,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3253,7 +3641,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3283,10 +3674,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3322,7 +3718,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3403,7 +3802,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3413,7 +3815,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -3462,7 +3867,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -3514,7 +3922,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -3544,7 +3955,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -3638,7 +4052,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3657,7 +4074,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3717,7 +4136,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3733,7 +4154,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -3750,7 +4174,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -3767,13 +4194,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -3803,11 +4235,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -3848,7 +4289,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3862,7 +4308,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3873,7 +4324,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -3881,7 +4334,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3895,7 +4353,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3906,7 +4369,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -3936,7 +4402,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -3948,7 +4417,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -3970,17 +4443,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -4017,7 +4499,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4054,7 +4539,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -4084,7 +4572,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -4099,7 +4590,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4129,7 +4622,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -4161,7 +4657,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4191,7 +4689,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -4245,7 +4746,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4267,7 +4771,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4303,7 +4809,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4339,7 +4848,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4369,10 +4881,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4408,7 +4925,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4489,7 +5009,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4499,7 +5022,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -4536,7 +5062,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -4588,7 +5117,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -4618,7 +5150,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -4712,7 +5247,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4731,7 +5269,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4791,7 +5331,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4807,7 +5349,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4824,7 +5369,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -4841,13 +5389,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -4877,11 +5430,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -4922,7 +5484,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4936,7 +5503,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4947,7 +5519,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -4955,7 +5529,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4969,7 +5548,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4980,7 +5564,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -5010,7 +5597,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -5022,7 +5612,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -5044,17 +5638,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -5091,7 +5694,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5128,7 +5734,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -5158,7 +5767,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -5173,7 +5785,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5203,7 +5817,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -5235,7 +5852,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5265,7 +5884,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -5319,7 +5941,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5341,7 +5966,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5377,7 +6004,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5413,7 +6043,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5443,10 +6076,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5482,7 +6120,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5563,7 +6204,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5573,7 +6217,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -5610,7 +6257,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -5662,7 +6312,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -5692,7 +6345,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -5786,7 +6442,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5805,7 +6464,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5865,7 +6526,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5881,7 +6544,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5898,7 +6564,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -5915,13 +6584,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -5951,11 +6625,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -5996,7 +6679,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6010,7 +6698,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6021,7 +6714,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -6029,7 +6724,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6043,7 +6743,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6054,7 +6759,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -6084,7 +6792,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -6096,7 +6807,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -6118,17 +6833,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -6165,7 +6889,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -6202,7 +6929,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -6232,7 +6962,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -6247,7 +6980,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6277,7 +7012,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -6309,7 +7047,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6339,7 +7079,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -6393,7 +7136,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6415,7 +7161,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6451,7 +7199,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6487,7 +7238,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6517,10 +7271,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6556,7 +7315,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6637,7 +7399,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6647,7 +7412,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -6668,7 +7436,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -6679,7 +7451,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -6707,7 +7481,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -6731,7 +7508,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -6745,7 +7525,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -6758,7 +7541,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -6787,7 +7573,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -6823,7 +7612,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6854,7 +7647,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6885,7 +7682,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6916,7 +7717,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6926,7 +7731,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -6948,7 +7757,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -6986,7 +7797,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -7000,20 +7816,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -7035,7 +7860,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -7049,20 +7879,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -7099,7 +7938,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -7151,7 +7993,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -7181,7 +8026,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -7275,7 +8123,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7294,7 +8145,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7354,7 +8207,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7370,7 +8225,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7387,7 +8245,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -7404,13 +8265,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -7440,11 +8306,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -7485,7 +8360,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7499,7 +8379,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7510,7 +8395,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -7518,7 +8405,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7532,7 +8424,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7543,7 +8440,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -7573,7 +8473,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -7585,7 +8488,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -7607,17 +8514,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -7654,7 +8570,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7691,7 +8610,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -7721,7 +8643,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -7736,7 +8661,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7766,7 +8693,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -7798,7 +8728,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7828,7 +8760,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -7882,7 +8817,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7904,7 +8842,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7940,7 +8880,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -7976,7 +8919,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8006,10 +8952,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8045,7 +8996,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8126,7 +9080,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8136,7 +9093,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -8173,7 +9133,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -8225,7 +9188,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -8255,7 +9221,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -8349,7 +9318,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8368,7 +9340,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8428,7 +9402,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8444,7 +9420,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8461,7 +9440,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -8478,13 +9460,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -8514,11 +9501,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -8559,7 +9555,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8573,7 +9574,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8584,7 +9590,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -8592,7 +9600,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8606,7 +9619,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8617,7 +9635,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -8647,7 +9668,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -8659,7 +9683,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -8681,17 +9709,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -8728,7 +9765,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8765,7 +9805,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -8795,7 +9838,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -8810,7 +9856,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8840,7 +9888,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -8872,7 +9923,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8902,7 +9955,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -8956,7 +10012,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8978,7 +10037,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9014,7 +10075,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9050,7 +10114,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9080,10 +10147,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9119,7 +10191,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9200,7 +10275,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -9210,7 +10288,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -9247,7 +10328,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -9299,7 +10383,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -9329,7 +10416,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -9423,7 +10513,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -9442,7 +10535,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9502,7 +10597,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9518,7 +10615,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -9535,7 +10635,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -9552,13 +10655,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -9588,11 +10696,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -9633,7 +10750,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9647,7 +10769,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9658,7 +10785,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -9666,7 +10795,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9680,7 +10814,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9691,7 +10830,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -9721,7 +10863,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -9733,7 +10878,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -9755,17 +10904,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -9802,7 +10960,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -9839,7 +11000,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -9869,7 +11033,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -9884,7 +11051,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9914,7 +11083,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -9946,7 +11118,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9976,7 +11150,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -10030,7 +11207,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10052,7 +11232,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10088,7 +11270,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10124,7 +11309,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10154,10 +11342,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10193,7 +11386,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10274,7 +11470,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10284,7 +11483,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -10333,7 +11535,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -10385,7 +11590,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -10415,7 +11623,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -10509,7 +11720,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10528,7 +11742,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10588,7 +11804,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10604,7 +11822,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10621,7 +11842,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -10638,13 +11862,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -10674,11 +11903,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -10719,7 +11957,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10733,7 +11976,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10744,7 +11992,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -10752,7 +12002,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10766,7 +12021,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10777,7 +12037,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -10807,7 +12070,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -10819,7 +12085,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -10841,17 +12111,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -10888,7 +12167,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10925,7 +12207,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -10955,7 +12240,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -10970,7 +12258,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11000,7 +12290,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -11032,7 +12325,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11062,7 +12357,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -11116,7 +12414,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11138,7 +12439,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11174,7 +12477,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11210,7 +12516,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11240,10 +12549,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11279,7 +12593,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11360,7 +12677,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11370,7 +12690,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -11407,7 +12730,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -11459,7 +12785,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -11489,7 +12818,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -11583,7 +12915,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11602,7 +12937,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11662,7 +12999,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11678,7 +13017,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11695,7 +13037,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -11712,13 +13057,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -11748,11 +13098,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -11793,7 +13152,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11807,7 +13171,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11818,7 +13187,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -11826,7 +13197,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11840,7 +13216,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11851,7 +13232,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -11881,7 +13265,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -11893,7 +13280,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -11915,17 +13306,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -11962,7 +13362,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11999,7 +13402,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -12029,7 +13435,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -12044,7 +13453,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12074,7 +13485,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -12106,7 +13520,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12136,7 +13552,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -12190,7 +13609,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12212,7 +13634,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12248,7 +13672,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12284,7 +13711,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12314,10 +13744,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12353,7 +13788,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12434,7 +13872,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12444,7 +13885,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -12481,7 +13925,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -12533,7 +13980,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -12563,7 +14013,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -12657,7 +14110,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12676,7 +14132,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12736,7 +14194,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12752,7 +14212,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -12769,7 +14232,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -12786,13 +14252,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -12822,11 +14293,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -12867,7 +14347,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12881,7 +14366,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12892,7 +14382,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -12900,7 +14392,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12914,7 +14411,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12925,7 +14427,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -12955,7 +14460,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -12967,7 +14475,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -12989,17 +14501,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -13036,7 +14557,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -13073,7 +14597,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -13103,7 +14630,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -13118,7 +14648,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13148,7 +14680,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -13180,7 +14715,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13210,7 +14747,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -13264,7 +14804,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13286,7 +14829,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13322,7 +14867,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13358,7 +14906,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13388,10 +14939,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13427,7 +14983,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13508,7 +15067,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13518,7 +15080,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -13539,7 +15104,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -13550,7 +15119,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -13578,7 +15149,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -13602,7 +15176,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -13616,7 +15193,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -13629,7 +15209,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -13658,7 +15241,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -13694,7 +15280,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13725,7 +15315,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13756,7 +15350,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13787,7 +15385,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13797,7 +15399,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -13819,7 +15425,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -13874,7 +15482,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -13926,7 +15537,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -13956,7 +15570,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -14050,7 +15667,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14069,7 +15689,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14129,7 +15751,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14145,7 +15769,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -14162,7 +15789,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -14179,13 +15809,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -14215,11 +15850,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -14260,7 +15904,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14274,7 +15923,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14285,7 +15939,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -14293,7 +15949,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14307,7 +15968,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14318,7 +15984,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -14348,7 +16017,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -14360,7 +16032,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -14382,17 +16058,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -14429,7 +16114,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -14466,7 +16154,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -14496,7 +16187,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -14511,7 +16205,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14541,7 +16237,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -14573,7 +16272,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14603,7 +16304,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -14657,7 +16361,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14679,7 +16386,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14715,7 +16424,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14751,7 +16463,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14781,10 +16496,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14820,7 +16540,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14901,7 +16624,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14911,7 +16637,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -14948,7 +16677,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -15000,7 +16732,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -15030,7 +16765,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -15124,7 +16862,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15143,7 +16884,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15203,7 +16946,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15219,7 +16964,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -15236,7 +16984,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -15253,13 +17004,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -15289,11 +17045,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -15334,7 +17099,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15348,7 +17118,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15359,7 +17134,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -15367,7 +17144,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15381,7 +17163,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15392,7 +17179,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -15422,7 +17212,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -15434,7 +17227,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -15456,17 +17253,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -15503,7 +17309,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -15540,7 +17349,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -15570,7 +17382,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -15585,7 +17400,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15615,7 +17432,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -15647,7 +17467,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15677,7 +17499,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -15731,7 +17556,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15753,7 +17581,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15789,7 +17619,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15825,7 +17658,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15855,10 +17691,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15894,7 +17735,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15975,7 +17819,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15985,7 +17832,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -16022,7 +17872,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -16074,7 +17927,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -16104,7 +17960,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -16198,7 +18057,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -16217,7 +18079,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16277,7 +18141,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16293,7 +18159,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -16310,7 +18179,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -16327,13 +18199,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -16363,11 +18240,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -16408,7 +18294,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16422,7 +18313,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16433,7 +18329,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -16441,7 +18339,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16455,7 +18358,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16466,7 +18374,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -16496,7 +18407,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -16508,7 +18422,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -16530,17 +18448,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -16577,7 +18504,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -16614,7 +18544,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -16644,7 +18577,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -16659,7 +18595,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16689,7 +18627,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -16721,7 +18662,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16751,7 +18694,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -16805,7 +18751,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -16827,7 +18776,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16863,7 +18814,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -16899,7 +18853,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -16929,10 +18886,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16968,7 +18930,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -17049,7 +19014,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17059,7 +19027,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -17080,7 +19051,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -17091,7 +19066,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -17111,7 +19088,7 @@ }, "additionalProperties": false }, - "assert": { + "assertions": { "type": "array", "items": { "anyOf": [ @@ -17142,7 +19119,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -17194,7 +19174,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -17224,7 +19207,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -17318,7 +19304,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17337,7 +19326,9 @@ "additionalProperties": {} } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17397,7 +19388,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17413,7 +19406,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -17430,7 +19426,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -17447,13 +19446,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -17483,11 +19487,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -17528,7 +19541,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17542,7 +19560,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17553,7 +19576,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -17561,7 +19586,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17575,7 +19605,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17586,7 +19621,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -17616,7 +19654,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -17628,7 +19669,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -17650,17 +19695,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -17697,7 +19751,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -17734,7 +19791,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -17764,7 +19824,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -17779,7 +19842,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17809,7 +19874,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -17841,7 +19909,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17871,7 +19941,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -17925,7 +19998,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17947,7 +20023,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17983,7 +20061,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18019,7 +20100,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18049,10 +20133,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18088,7 +20177,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18169,7 +20261,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -18179,13 +20274,1211 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] } }, - "workspace": { + "assert": { + "type": "array", + "items": { + "anyOf": [ + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "code-judge", + "code_judge" + ] + }, + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "cwd": { + "type": "string" + }, + "target": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "object", + "properties": { + "max_calls": { + "type": "number" + } + }, + "additionalProperties": false + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": [ + "type", + "command" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "llm-judge", + "llm_judge" + ] + }, + "prompt": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "object", + "properties": { + "command": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "script": { + "anyOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "additionalProperties": false + } + ] + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "required_min_score": { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "model": { + "type": "string" + }, + "target": { + "type": "string" + }, + "config": { + "type": "object", + "additionalProperties": {} + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "composite" + }, + "assertions": { + "type": "array", + "items": {} + }, + "assert": { + "type": "array", + "items": {} + }, + "evaluators": { + "type": "array", + "items": {} + }, + "aggregator": { + "anyOf": [ + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "weighted_average" + }, + "weights": { + "type": "object", + "additionalProperties": { + "type": "number" + } + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "threshold" + }, + "threshold": { + "type": "number", + "minimum": 0, + "maximum": 1 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "code-judge" + }, + "path": { + "type": "string" + }, + "cwd": { + "type": "string" + } + }, + "required": [ + "type", + "path" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "type": { + "type": "string", + "const": "llm-judge" + }, + "prompt": { + "type": "string" + }, + "model": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false + } + ] + } + }, + "required": [ + "type", + "aggregator" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] + }, + "mode": { + "type": "string", + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] + }, + "minimums": { + "type": "object", + "additionalProperties": { + "type": "integer", + "minimum": 0 + } + }, + "expected": { + "type": "array", + "items": { + "type": "object", + "properties": { + "tool": { + "type": "string" + }, + "args": { + "anyOf": [ + { + "type": "string", + "const": "any" + }, + { + "type": "object", + "additionalProperties": {} + } + ] + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "maxDurationMs": { + "type": "number", + "minimum": 0 + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "tool" + ], + "additionalProperties": false + } + }, + "args_match": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "argsMatch": { + "anyOf": [ + { + "type": "string", + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "required": [ + "type", + "mode" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "field-accuracy", + "field_accuracy" + ] + }, + "fields": { + "type": "array", + "items": { + "type": "object", + "properties": { + "path": { + "type": "string" + }, + "match": { + "type": "string", + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] + }, + "required": { + "type": "boolean" + }, + "weight": { + "type": "number" + }, + "tolerance": { + "type": "number", + "minimum": 0 + }, + "relative": { + "type": "boolean" + }, + "formats": { + "type": "array", + "items": { + "type": "string" + } + } + }, + "required": [ + "path", + "match" + ], + "additionalProperties": false + }, + "minItems": 1 + }, + "aggregation": { + "type": "string", + "enum": [ + "weighted_average", + "all_or_nothing" + ] + } + }, + "required": [ + "type", + "fields" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "latency" + }, + "threshold": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "threshold" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "cost" + }, + "budget": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type", + "budget" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "token-usage", + "token_usage" + ] + }, + "max_total": { + "type": "number", + "minimum": 0 + }, + "max_input": { + "type": "number", + "minimum": 0 + }, + "max_output": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "execution-metrics", + "execution_metrics" + ] + }, + "max_tool_calls": { + "type": "number", + "minimum": 0 + }, + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { + "type": "number", + "minimum": 0, + "maximum": 1 + }, + "exploration_tolerance": { + "type": "number", + "minimum": 0 + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "agent-judge", + "agent_judge" + ] + }, + "prompt": { + "type": "string" + }, + "rubrics": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "required_min_score": { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + } + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 + }, + "target": { + "type": "string" + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "contains" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "regex" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "enum": [ + "is-json", + "is_json" + ] + } + }, + "required": [ + "type" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "equals" + }, + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { + "type": "string" + }, + "weight": { + "type": "number", + "minimum": 0 + }, + "required": { + "anyOf": [ + { + "type": "boolean" + }, + { + "type": "number", + "exclusiveMinimum": 0, + "maximum": 1 + } + ] + }, + "negate": { + "type": "boolean" + }, + "type": { + "type": "string", + "const": "rubrics" + }, + "criteria": { + "type": "array", + "items": { + "type": "object", + "properties": { + "id": { + "type": "string" + }, + "outcome": { + "type": "string" + }, + "weight": { + "type": "number" + }, + "required": { + "type": "boolean" + }, + "required_min_score": { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + "score_ranges": { + "type": "array", + "items": { + "type": "object", + "properties": { + "score_range": { + "type": "array", + "minItems": 2, + "maxItems": 2, + "items": [ + { + "type": "integer", + "minimum": 0, + "maximum": 10 + }, + { + "type": "integer", + "minimum": 0, + "maximum": 10 + } + ] + }, + "outcome": { + "type": "string", + "minLength": 1 + } + }, + "required": [ + "score_range", + "outcome" + ], + "additionalProperties": false + } + } + }, + "additionalProperties": false + }, + "minItems": 1 + } + }, + "required": [ + "type", + "criteria" + ], + "additionalProperties": false + } + ] + } + }, + "workspace": { "anyOf": [ { "type": "object", @@ -18195,7 +21488,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -18219,7 +21515,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -18233,7 +21532,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -18246,7 +21548,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -18275,7 +21580,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -18311,7 +21619,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -18342,7 +21654,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -18373,7 +21689,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -18404,7 +21724,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -18414,7 +21738,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -18428,7 +21756,9 @@ ] } }, - "required": ["tests"], + "required": [ + "tests" + ], "additionalProperties": false } } diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md b/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md index 8ab9a8211..ac7d6edc1 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/rubric-evaluator.md @@ -1,6 +1,6 @@ # Rubric Evaluator -Rubrics are defined as `assert` entries with `type: rubrics`. They support binary checklist grading and score-range analytic grading. +Rubrics are defined as `assertions` entries with `type: rubrics`. They support binary checklist grading and score-range analytic grading. ## Field Reference @@ -23,10 +23,10 @@ Rubrics are defined as `assert` entries with `type: rubrics`. They support binar ## String Shorthand (Recommended) -Plain strings in `assert` are automatically treated as rubric criteria: +Plain strings in `assertions` are automatically treated as rubric criteria: ```yaml -assert: +assertions: - Mentions divide-and-conquer approach - Explains partition step - States time complexity @@ -34,10 +34,10 @@ assert: Equivalent to the full form with `type: rubrics`. Use the full form only when you need weights, `required: false`, or `score_ranges`. -Mixed strings and objects are supported — strings are grouped into a single rubrics evaluator at the position of the first string: +Mixed strings and objects are supported in `assertions` — strings are grouped into a single rubrics evaluator at the position of the first string: ```yaml -assert: +assertions: - Mentions divide-and-conquer approach # grouped into rubrics - type: code-judge # kept as-is command: [check_syntax.py] @@ -47,7 +47,7 @@ assert: ## Checklist Mode ```yaml -assert: +assertions: - type: rubrics criteria: - Mentions divide-and-conquer approach @@ -66,7 +66,7 @@ assert: Shorthand map format (recommended): ```yaml -assert: +assertions: - type: rubrics criteria: - id: correctness From fc63747a843e5513108776f64460acdcec756e34 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 03:54:49 +0000 Subject: [PATCH 2/2] style: format eval-schema.json with biome --- .../references/eval-schema.json | 3700 ++++------------- 1 file changed, 722 insertions(+), 2978 deletions(-) diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 57c56188e..9093c7e48 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,12 +53,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -72,29 +67,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -129,12 +115,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -148,29 +129,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -192,12 +164,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -211,29 +178,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -270,10 +228,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -325,10 +280,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -358,10 +310,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -455,10 +404,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -477,9 +423,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -539,9 +483,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -557,10 +499,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -577,10 +516,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -597,18 +533,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -638,20 +569,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -692,12 +614,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -711,12 +628,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -727,9 +639,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -737,12 +647,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -756,12 +661,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -772,10 +672,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -805,10 +702,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -820,11 +714,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -846,26 +736,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -902,10 +783,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -942,10 +820,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -975,10 +850,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -993,9 +865,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1025,10 +895,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -1060,9 +927,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1092,10 +957,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -1149,10 +1011,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1174,9 +1033,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1212,10 +1069,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1251,10 +1105,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1284,15 +1135,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1328,10 +1174,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1412,10 +1255,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1425,10 +1265,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -1465,10 +1302,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -1520,10 +1354,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -1553,10 +1384,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -1650,10 +1478,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1672,9 +1497,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1734,9 +1557,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1752,10 +1573,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1772,10 +1590,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -1792,18 +1607,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -1833,20 +1643,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -1887,12 +1688,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1906,12 +1702,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1922,9 +1713,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -1932,12 +1721,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1951,12 +1735,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1967,10 +1746,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -2000,10 +1776,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -2015,11 +1788,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -2041,26 +1810,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -2097,10 +1857,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2137,10 +1894,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -2170,10 +1924,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -2188,9 +1939,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2220,10 +1969,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -2255,9 +2001,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2287,10 +2031,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -2344,10 +2085,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2369,9 +2107,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2407,10 +2143,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2446,10 +2179,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2479,15 +2209,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2523,10 +2248,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2607,10 +2329,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2620,10 +2339,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -2660,10 +2376,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -2715,10 +2428,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -2748,10 +2458,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -2845,10 +2552,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2867,9 +2571,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2929,9 +2631,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2947,10 +2647,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2967,10 +2664,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -2987,18 +2681,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -3028,20 +2717,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -3082,12 +2762,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3101,12 +2776,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3117,9 +2787,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -3127,12 +2795,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3146,12 +2809,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -3162,10 +2820,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -3195,10 +2850,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -3210,11 +2862,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -3236,26 +2884,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -3292,10 +2931,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3332,10 +2968,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -3365,10 +2998,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -3383,9 +3013,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3415,10 +3043,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -3450,9 +3075,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3482,10 +3105,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -3539,10 +3159,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3564,9 +3181,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3602,10 +3217,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3641,10 +3253,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3674,15 +3283,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3718,10 +3322,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3802,10 +3403,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3815,10 +3413,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -3867,10 +3462,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -3922,10 +3514,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -3955,10 +3544,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -4052,10 +3638,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4074,9 +3657,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4136,9 +3717,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4154,10 +3733,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4174,10 +3750,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -4194,18 +3767,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -4235,20 +3803,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -4289,12 +3848,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4308,12 +3862,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4324,9 +3873,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -4334,12 +3881,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4353,12 +3895,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4369,10 +3906,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -4402,10 +3936,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -4417,11 +3948,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -4443,26 +3970,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -4499,10 +4017,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4539,10 +4054,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -4572,10 +4084,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -4590,9 +4099,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4622,10 +4129,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -4657,9 +4161,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4689,10 +4191,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -4746,10 +4245,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4771,9 +4267,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4809,10 +4303,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4848,10 +4339,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4881,15 +4369,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4925,10 +4408,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5009,10 +4489,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5022,10 +4499,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -5062,10 +4536,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -5117,10 +4588,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -5150,10 +4618,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -5247,10 +4712,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5269,9 +4731,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5331,9 +4791,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5349,10 +4807,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -5369,10 +4824,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -5389,18 +4841,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -5430,20 +4877,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -5484,12 +4922,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5503,12 +4936,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5519,9 +4947,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -5529,12 +4955,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5548,12 +4969,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5564,10 +4980,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -5597,10 +5010,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -5612,11 +5022,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -5638,26 +5044,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -5694,10 +5091,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -5734,10 +5128,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -5767,10 +5158,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -5785,9 +5173,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5817,10 +5203,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -5852,9 +5235,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5884,10 +5265,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -5941,10 +5319,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5966,9 +5341,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6004,10 +5377,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6043,10 +5413,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6076,15 +5443,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6120,10 +5482,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6204,10 +5563,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -6217,10 +5573,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -6257,10 +5610,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -6312,10 +5662,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -6345,10 +5692,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -6442,10 +5786,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -6464,9 +5805,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6526,9 +5865,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6544,10 +5881,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6564,10 +5898,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -6584,18 +5915,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -6625,20 +5951,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -6679,12 +5996,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6698,12 +6010,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6714,9 +6021,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -6724,12 +6029,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6743,12 +6043,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6759,10 +6054,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -6792,10 +6084,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -6807,11 +6096,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -6833,26 +6118,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -6889,10 +6165,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6929,10 +6202,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -6962,10 +6232,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -6980,9 +6247,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7012,10 +6277,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -7047,9 +6309,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7079,10 +6339,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -7136,10 +6393,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7161,9 +6415,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7199,10 +6451,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -7238,10 +6487,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -7271,15 +6517,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7315,10 +6556,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -7399,10 +6637,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7412,10 +6647,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -7436,11 +6668,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -7451,9 +6679,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -7481,10 +6707,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -7508,10 +6731,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -7525,10 +6745,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -7541,10 +6758,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -7573,10 +6787,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -7612,11 +6823,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7647,11 +6854,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7682,11 +6885,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7717,11 +6916,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7731,11 +6926,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -7757,9 +6948,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -7797,12 +6986,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7816,29 +7000,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -7860,12 +7035,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7879,29 +7049,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -7938,10 +7099,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -7993,10 +7151,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -8026,10 +7181,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -8123,10 +7275,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8145,9 +7294,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8207,9 +7354,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8225,10 +7370,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -8245,10 +7387,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -8265,18 +7404,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -8306,20 +7440,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -8360,12 +7485,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8379,12 +7499,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8395,9 +7510,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -8405,12 +7518,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8424,12 +7532,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8440,10 +7543,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -8473,10 +7573,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -8488,11 +7585,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -8514,26 +7607,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -8570,10 +7654,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -8610,10 +7691,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -8643,10 +7721,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -8661,9 +7736,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8693,10 +7766,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -8728,9 +7798,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8760,10 +7828,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -8817,10 +7882,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8842,9 +7904,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8880,10 +7940,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8919,10 +7976,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8952,15 +8006,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8996,10 +8045,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9080,10 +8126,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9093,10 +8136,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -9133,10 +8173,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -9188,10 +8225,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -9221,10 +8255,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -9318,10 +8349,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9340,9 +8368,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9402,9 +8428,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9420,10 +8444,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9440,10 +8461,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -9460,18 +8478,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -9501,20 +8514,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -9555,12 +8559,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9574,12 +8573,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9590,9 +8584,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -9600,12 +8592,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9619,12 +8606,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9635,10 +8617,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -9668,10 +8647,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -9683,11 +8659,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -9709,26 +8681,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -9765,10 +8728,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9805,10 +8765,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -9838,10 +8795,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -9856,9 +8810,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9888,10 +8840,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -9923,9 +8872,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9955,10 +8902,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -10012,10 +8956,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10037,9 +8978,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10075,10 +9014,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10114,10 +9050,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10147,15 +9080,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10191,10 +9119,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10275,10 +9200,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10288,10 +9210,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -10328,10 +9247,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -10383,10 +9299,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -10416,10 +9329,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -10513,10 +9423,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10535,9 +9442,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10597,9 +9502,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10615,10 +9518,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -10635,10 +9535,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -10655,18 +9552,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -10696,20 +9588,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -10750,12 +9633,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10769,12 +9647,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10785,9 +9658,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -10795,12 +9666,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10814,12 +9680,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10830,10 +9691,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -10863,10 +9721,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -10878,11 +9733,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -10904,26 +9755,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -10960,10 +9802,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11000,10 +9839,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -11033,10 +9869,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -11051,9 +9884,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11083,10 +9914,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -11118,9 +9946,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11150,10 +9976,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -11207,10 +10030,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11232,9 +10052,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11270,10 +10088,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11309,10 +10124,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11342,15 +10154,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11386,10 +10193,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11470,10 +10274,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11483,10 +10284,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -11535,10 +10333,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -11590,10 +10385,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -11623,10 +10415,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -11720,10 +10509,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11742,9 +10528,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11804,9 +10588,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11822,10 +10604,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11842,10 +10621,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -11862,18 +10638,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -11903,20 +10674,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -11957,12 +10719,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11976,12 +10733,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11992,9 +10744,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -12002,12 +10752,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12021,12 +10766,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12037,10 +10777,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -12070,10 +10807,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -12085,11 +10819,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -12111,26 +10841,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -12167,10 +10888,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -12207,10 +10925,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -12240,10 +10955,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -12258,9 +10970,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12290,10 +11000,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -12325,9 +11032,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12357,10 +11062,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -12414,10 +11116,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12439,9 +11138,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12477,10 +11174,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12516,10 +11210,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12549,15 +11240,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12593,10 +11279,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12677,10 +11360,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12690,10 +11370,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -12730,10 +11407,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -12785,10 +11459,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -12818,10 +11489,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -12915,10 +11583,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12937,9 +11602,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12999,9 +11662,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13017,10 +11678,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13037,10 +11695,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -13057,18 +11712,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -13098,20 +11748,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -13152,12 +11793,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13171,12 +11807,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13187,9 +11818,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -13197,12 +11826,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13216,12 +11840,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13232,10 +11851,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -13265,10 +11881,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -13280,11 +11893,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -13306,26 +11915,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -13362,10 +11962,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13402,10 +11999,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -13435,10 +12029,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -13453,9 +12044,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13485,10 +12074,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -13520,9 +12106,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13552,10 +12136,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -13609,10 +12190,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -13634,9 +12212,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13672,10 +12248,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13711,10 +12284,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13744,15 +12314,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13788,10 +12353,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13872,10 +12434,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -13885,10 +12444,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -13925,10 +12481,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -13980,10 +12533,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -14013,10 +12563,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -14110,10 +12657,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14132,9 +12676,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14194,9 +12736,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14212,10 +12752,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14232,10 +12769,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -14252,18 +12786,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -14293,20 +12822,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -14347,12 +12867,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14366,12 +12881,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14382,9 +12892,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -14392,12 +12900,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14411,12 +12914,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14427,10 +12925,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -14460,10 +12955,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -14475,11 +12967,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -14501,26 +12989,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -14557,10 +13036,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14597,10 +13073,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -14630,10 +13103,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -14648,9 +13118,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14680,10 +13148,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -14715,9 +13180,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14747,10 +13210,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -14804,10 +13264,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14829,9 +13286,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14867,10 +13322,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -14906,10 +13358,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -14939,15 +13388,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14983,10 +13427,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15067,10 +13508,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15080,10 +13518,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -15104,11 +13539,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -15119,9 +13550,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -15149,10 +13578,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -15176,10 +13602,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -15193,10 +13616,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -15209,10 +13629,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -15241,10 +13658,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -15280,11 +13694,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15315,11 +13725,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15350,11 +13756,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15385,11 +13787,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -15399,11 +13797,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -15425,9 +13819,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -15482,10 +13874,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -15537,10 +13926,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -15570,10 +13956,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -15667,10 +14050,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15689,9 +14069,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15751,9 +14129,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15769,10 +14145,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -15789,10 +14162,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -15809,18 +14179,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -15850,20 +14215,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -15904,12 +14260,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15923,12 +14274,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15939,9 +14285,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -15949,12 +14293,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15968,12 +14307,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15984,10 +14318,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -16017,10 +14348,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -16032,11 +14360,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -16058,26 +14382,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -16114,10 +14429,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -16154,10 +14466,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -16187,10 +14496,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -16205,9 +14511,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16237,10 +14541,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -16272,9 +14573,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16304,10 +14603,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -16361,10 +14657,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16386,9 +14679,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16424,10 +14715,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16463,10 +14751,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16496,15 +14781,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16540,10 +14820,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16624,10 +14901,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16637,10 +14911,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -16677,10 +14948,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -16732,10 +15000,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -16765,10 +15030,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -16862,10 +15124,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16884,9 +15143,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16946,9 +15203,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16964,10 +15219,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -16984,10 +15236,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -17004,18 +15253,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -17045,20 +15289,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -17099,12 +15334,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17118,12 +15348,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17134,9 +15359,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -17144,12 +15367,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17163,12 +15381,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -17179,10 +15392,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -17212,10 +15422,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -17227,11 +15434,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -17253,26 +15456,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -17309,10 +15503,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -17349,10 +15540,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -17382,10 +15570,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -17400,9 +15585,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17432,10 +15615,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -17467,9 +15647,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17499,10 +15677,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -17556,10 +15731,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -17581,9 +15753,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17619,10 +15789,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17658,10 +15825,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17691,15 +15855,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17735,10 +15894,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17819,10 +15975,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -17832,10 +15985,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -17872,10 +16022,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -17927,10 +16074,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -17960,10 +16104,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -18057,10 +16198,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -18079,9 +16217,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18141,9 +16277,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18159,10 +16293,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -18179,10 +16310,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -18199,18 +16327,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -18240,20 +16363,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -18294,12 +16408,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18313,12 +16422,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18329,9 +16433,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -18339,12 +16441,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18358,12 +16455,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18374,10 +16466,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -18407,10 +16496,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -18422,11 +16508,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -18448,26 +16530,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -18504,10 +16577,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -18544,10 +16614,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -18577,10 +16644,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -18595,9 +16659,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18627,10 +16689,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -18662,9 +16721,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18694,10 +16751,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -18751,10 +16805,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -18776,9 +16827,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18814,10 +16863,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -18853,10 +16899,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -18886,15 +16929,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18930,10 +16968,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -19014,10 +17049,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -19027,10 +17059,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -19051,11 +17080,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -19066,9 +17091,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -19119,10 +17142,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -19174,10 +17194,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -19207,10 +17224,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -19304,10 +17318,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -19326,9 +17337,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19388,9 +17397,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19406,10 +17413,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -19426,10 +17430,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -19446,18 +17447,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -19487,20 +17483,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -19541,12 +17528,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19560,12 +17542,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19576,9 +17553,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -19586,12 +17561,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19605,12 +17575,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19621,10 +17586,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -19654,10 +17616,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -19669,11 +17628,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -19695,26 +17650,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -19751,10 +17697,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -19791,10 +17734,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -19824,10 +17764,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -19842,9 +17779,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19874,10 +17809,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -19909,9 +17841,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19941,10 +17871,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -19998,10 +17925,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -20023,9 +17947,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -20061,10 +17983,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -20100,10 +18019,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -20133,15 +18049,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -20177,10 +18088,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -20261,10 +18169,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -20274,10 +18179,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -20314,10 +18216,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -20369,10 +18268,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -20402,10 +18298,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -20499,10 +18392,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -20521,9 +18411,7 @@ "additionalProperties": {} } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -20583,9 +18471,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -20601,10 +18487,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -20621,10 +18504,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -20641,18 +18521,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -20682,20 +18557,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -20736,12 +18602,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -20755,12 +18616,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -20771,9 +18627,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -20781,12 +18635,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -20800,12 +18649,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -20816,10 +18660,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -20849,10 +18690,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -20864,11 +18702,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -20890,26 +18724,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -20946,10 +18771,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -20986,10 +18808,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -21019,10 +18838,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -21037,9 +18853,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -21069,10 +18883,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -21104,9 +18915,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -21136,10 +18945,7 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "enum": ["agent-judge", "agent_judge"] }, "prompt": { "type": "string" @@ -21193,10 +18999,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -21218,9 +19021,7 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -21256,10 +19057,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -21295,10 +19093,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -21328,15 +19123,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -21372,10 +19162,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -21456,10 +19243,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -21469,10 +19253,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -21488,10 +19269,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -21515,10 +19293,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -21532,10 +19307,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -21548,10 +19320,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -21580,10 +19349,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -21619,11 +19385,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -21654,11 +19416,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -21689,11 +19447,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -21724,11 +19478,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -21738,11 +19488,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -21756,9 +19502,7 @@ ] } }, - "required": [ - "tests" - ], + "required": ["tests"], "additionalProperties": false } }