diff --git a/CLAUDE.md b/CLAUDE.md index ea7608f0f..492e4c944 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -159,6 +159,28 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m 5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic. +## Completing Work — E2E Checklist + +Before marking any branch as ready for review, complete this checklist: + +1. **Copy `.env` to worktree** (if working in a git worktree): + ```bash + cp /home/christso/projects/agentv/.env .env + ``` + Without this, any eval run or LLM-dependent test will fail with missing API key errors. + +2. **Run unit tests**: `bun run test` — all must pass. + +3. **Run at least one real eval** against an example file to verify end-to-end behavior: + ```bash + bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id + ``` + Inspect the output JSONL to confirm correct evaluator type, scores, and hits/misses. + +4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types). + +5. **Mark PR as ready** only after all above steps pass. + ## Evaluator Type System Evaluator types use **kebab-case** everywhere (matching promptfoo convention): @@ -248,6 +270,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow: ``` 4. 
**Before merging**, ensure: + - **E2E verification completed** (see "Completing Work — E2E Checklist" above) - CI pipeline passes (all checks green) - Code has been reviewed if required - No merge conflicts with `main` diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 4a7ec1b50..7e2117107 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -163,6 +163,17 @@ export const evalRunCommand = command({ description: 'Write companion artifacts (grading/.json, timing.json, benchmark.json) to the specified directory', }), + judgeTarget: option({ + type: optional(string), + long: 'judge-target', + description: + 'Override judge target for all evaluators (e.g., "agentv", or a target name from targets.yaml)', + }), + model: option({ + type: optional(string), + long: 'model', + description: 'Override model for the judge target (e.g., "openai:gpt-5-mini")', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -203,6 +214,8 @@ export const evalRunCommand = command({ strict: args.strict, benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, + judgeTarget: args.judgeTarget, + model: args.model, }; await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); }, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 43eec380b..1f3d77f46 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -82,6 +82,8 @@ interface NormalizedOptions { readonly workspacePath?: string; readonly benchmarkJson?: string; readonly artifacts?: string; + readonly judgeTarget?: string; + readonly model?: string; } function normalizeBoolean(value: unknown): boolean { @@ -249,6 +251,8 @@ function normalizeOptions( workspacePath, benchmarkJson: normalizeString(rawOptions.benchmarkJson), artifacts: normalizeString(rawOptions.artifacts), + judgeTarget: 
normalizeString(rawOptions.judgeTarget), + model: normalizeString(rawOptions.model), } satisfies NormalizedOptions; } @@ -593,6 +597,8 @@ async function runSingleEvalFile(params: { trials: trialsConfig, totalBudgetUsd, failOnError, + judgeTarget: options.judgeTarget, + model: options.model, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { // Finalize streaming observer span with score @@ -674,6 +680,11 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + // Validate --judge-target / --model combinations + if (options.judgeTarget === 'agentv' && !options.model) { + throw new Error('--judge-target agentv requires --model (e.g., --model openai:gpt-5-mini)'); + } + // --retry-errors: override filter to only re-run execution_error test cases. // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, // since the retry source and output destination may refer to the same file. diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx index cc1545b64..4881674a9 100644 --- a/apps/web/src/content/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/evaluation/eval-cases.mdx @@ -265,7 +265,7 @@ tests: ### `assert` present — explicit evaluators only -When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, `agent-judge`, or `rubrics`) receive `criteria` as input automatically. +When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, or `rubrics`) receive `criteria` as input automatically. 
If `assert` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: diff --git a/apps/web/src/content/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/guides/agent-eval-layers.mdx index 783a2ca55..6d0f542cf 100644 --- a/apps/web/src/content/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/guides/agent-eval-layers.mdx @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based | Concern | AgentV evaluator | |---------|-----------------| -| Plan quality & coherence | `llm_judge` with reasoning-focused prompt | -| Workspace-aware auditing | `agent_judge` with rubrics | +| Plan quality & coherence | `llm-judge` with reasoning-focused prompt | +| Workspace-aware auditing | `llm-judge` with rubrics | ```yaml # Layer 1: Reasoning — verify the agent's plan makes sense @@ -29,7 +29,7 @@ assertions: Did it select appropriate tools for the task? Score 1.0 if reasoning is sound, 0.0 if not. 
- name: workspace-audit - type: agent-judge + type: llm-judge max_steps: 5 temperature: 0 rubrics: diff --git a/bun.lock b/bun.lock index 200a436cc..70471cff6 100644 --- a/bun.lock +++ b/bun.lock @@ -24,7 +24,7 @@ }, "apps/cli": { "name": "agentv", - "version": "2.12.0", + "version": "2.19.0", "bin": { "agentv": "./dist/cli.js", }, @@ -61,13 +61,14 @@ }, "packages/core": { "name": "@agentv/core", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", @@ -95,7 +96,7 @@ }, "packages/eval": { "name": "@agentv/eval", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "zod": "^3.23.8", }, diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md index a20a7909f..9d6047886 100644 --- a/docs/plans/2026-02-26-eval-schema-generation-design.md +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -248,14 +248,9 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.literal('agent_judge'), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); +// Note: agent_judge was removed — llm-judge now covers all judge use cases +// including agentic behavior (auto-detected based on judge provider kind). +// See LlmJudgeSchema above for the unified schema. 
const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), @@ -292,7 +287,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/examples/features/agent-judge/.agentv/targets.yaml b/examples/features/agent-judge/.agentv/targets.yaml deleted file mode 100644 index 6d5c82918..000000000 --- a/examples/features/agent-judge/.agentv/targets.yaml +++ /dev/null @@ -1,22 +0,0 @@ -targets: - # Mock agent that "creates tests" in the workspace. - # Each test gets a fresh copy of workspace-template/ as its CWD. - - name: mock_agent - provider: cli - command: >- - bash -c ' - mkdir -p tests && - printf "import { add, multiply } from \"../src/main\";\n\ndescribe(\"math functions\", () => {\n test(\"add returns sum\", () => {\n expect(add(2, 3)).toBe(5);\n });\n\n test(\"multiply returns product\", () => {\n expect(multiply(4, 5)).toBe(20);\n });\n});\n" > tests/math.test.ts && - printf "import { greet } from \"../src/main\";\n\ndescribe(\"greet\", () => {\n test(\"returns greeting\", () => {\n expect(greet(\"World\")).toBe(\"Hello, World!\");\n });\n});\n" > tests/greet.test.ts && - echo "Created test files: tests/math.test.ts and tests/greet.test.ts" > {OUTPUT_FILE} - ' - workspace_template: ../workspace-template - judge_target: azure_judge - - # Azure OpenAI target used as judge provider for built-in agent_judge mode. 
- - name: azure_judge - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} diff --git a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl b/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl deleted file mode 100644 index bc2d5b6ee..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"timestamp":"2026-02-20T21:37:58.641Z","test_id":"verify-test-creation-freeform","dataset":"dataset","score":1,"hits":["Created tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"target":"mock_agent","reasoning":"workspace-audit: All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","scores":[{"name":"workspace-audit","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["Created tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"reasoning":"All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","details":{"mode":"built-in","steps":3,"tool_calls":5}}]} -{"timestamp":"2026-02-20T21:37:59.540Z","test_id":"verify-test-creation-rubric","dataset":"dataset","score":1,"hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 
'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"target":"mock_agent","reasoning":"workspace-audit-rubric: All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. The candidate answer meets all rubric criteria.","scores":[{"name":"workspace-audit-rubric","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"reasoning":"All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. The candidate answer meets all rubric criteria.","details":{"mode":"built-in","steps":2,"tool_calls":3}}]} diff --git a/examples/features/agent-judge/evals/dataset.eval.yaml b/examples/features/agent-judge/evals/dataset.eval.yaml deleted file mode 100644 index a9bf21048..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Agent Judge feature demonstration -# Tests that the agent_judge evaluator can investigate the workspace -# to verify that an agent created the expected files and content. 
-# -# The mock_agent creates test files in the workspace-template. -# The agent_judge evaluator uses an AI SDK agent loop with filesystem tools -# to verify the test files exist and contain proper test cases. - -description: Verify agent_judge evaluator can audit workspace file creation - -execution: - target: mock_agent - -tests: - # Case 1: freeform agent_judge (no rubrics) — scores 0-1 - - id: verify-test-creation-freeform - criteria: >- - The agent should create unit test files in a tests/ directory. - Test files should import from src/main.ts and test the add, multiply, - and greet functions with meaningful assertions. - - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit - type: agent-judge - max_steps: 5 - temperature: 0 - - # Case 2: rubric-based agent_judge — structured evaluation - - id: verify-test-creation-rubric - criteria: >- - The agent should create comprehensive unit tests for the project. 
- - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit-rubric - type: agent-judge - max_steps: 5 - temperature: 0 - rubrics: - - id: tests-dir-exists - outcome: "A tests/ directory exists in the workspace" - weight: 1.0 - required: true - - id: math-tests - outcome: "Test file exists that tests the add and multiply functions" - weight: 1.0 - required: true - - id: greet-tests - outcome: "Test file exists that tests the greet function" - weight: 1.0 - - id: assertions-present - outcome: "Tests contain proper assertions (expect/assert calls)" - weight: 0.5 diff --git a/examples/features/agent-judge/workspace-template/package.json b/examples/features/agent-judge/workspace-template/package.json deleted file mode 100644 index 24d635536..000000000 --- a/examples/features/agent-judge/workspace-template/package.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "sample-project", - "version": "1.0.0", - "type": "module" -} diff --git a/examples/features/agent-judge/workspace-template/src/main.ts b/examples/features/agent-judge/workspace-template/src/main.ts deleted file mode 100644 index cfda22527..000000000 --- a/examples/features/agent-judge/workspace-template/src/main.ts +++ /dev/null @@ -1,11 +0,0 @@ -export function add(a: number, b: number): number { - return a + b; -} - -export function multiply(a: number, b: number): number { - return a * b; -} - -export function greet(name: string): string { - return `Hello, ${name}!`; -} diff --git a/examples/features/file-changes-judges/.agentv/targets.yaml b/examples/features/file-changes-judges/.agentv/targets.yaml index d9645bc03..10c067b31 100644 --- a/examples/features/file-changes-judges/.agentv/targets.yaml +++ b/examples/features/file-changes-judges/.agentv/targets.yaml @@ -11,7 +11,7 @@ targets: workspace_template: ../workspace-template judge_target: azure_judge - # Azure OpenAI — used as LLM judge (rubrics) and built-in 
agent_judge provider + # Azure OpenAI — used as LLM judge (rubrics) and built-in llm-judge provider - name: azure_judge provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} @@ -19,7 +19,7 @@ targets: model: ${{ AZURE_DEPLOYMENT_NAME }} version: ${{ AZURE_OPENAI_API_VERSION }} - # Copilot CLI — used as delegated agent_judge target + # Copilot CLI — used as delegated llm-judge target - name: copilot_judge provider: copilot-cli model: claude-haiku-4.5 diff --git a/examples/features/file-changes-judges/evals/dataset.eval.yaml b/examples/features/file-changes-judges/evals/dataset.eval.yaml index 65ebd68df..2fb796537 100644 --- a/examples/features/file-changes-judges/evals/dataset.eval.yaml +++ b/examples/features/file-changes-judges/evals/dataset.eval.yaml @@ -2,13 +2,13 @@ # # Proves that file_changes diffs are correctly passed to all judge types: # 1. rubrics — LLM judge (Azure) evaluates the diff -# 2. agent_judge — built-in mode (Azure via AI SDK) sees file_changes in prompt -# 3. agent_judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt +# 2. llm-judge — built-in mode (Azure via AI SDK) sees file_changes in prompt +# 3. llm-judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt # # The mock agent adds a `subtract` function to calculator.ts, producing a small # diff (~10 lines) that fits comfortably in any LLM context window. -description: Verify file_changes diffs are accessible to LLM judge, built-in agent judge, and copilot-cli agent judge +description: Verify file_changes diffs are accessible to LLM judge (rubrics, built-in, and copilot-cli) execution: target: mock_agent @@ -43,14 +43,14 @@ tests: outcome: "The file_changes contains a valid unified diff format" weight: 0.5 - # 2. Built-in agent judge — Azure via AI SDK with filesystem tools - - name: agent-judge-builtin - type: agent-judge + # 2. 
Built-in LLM judge — Azure via AI SDK with filesystem tools + - name: llm-judge-builtin + type: llm-judge max_steps: 3 temperature: 0 - # 3. Copilot CLI agent judge — delegated via target - - name: agent-judge-copilot - type: agent-judge + # 3. Copilot CLI LLM judge — delegated via target + - name: llm-judge-copilot + type: llm-judge target: copilot_judge temperature: 0 diff --git a/packages/core/package.json b/packages/core/package.json index d0c0a031e..600890177 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -39,11 +39,12 @@ }, "files": ["dist", "README.md"], "dependencies": { - "@agentv/eval": "workspace:*", "@agentclientprotocol/sdk": "^0.14.1", + "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", diff --git a/packages/core/src/evaluation/evaluators/agent-judge.ts b/packages/core/src/evaluation/evaluators/agent-judge.ts deleted file mode 100644 index 2dc00f769..000000000 --- a/packages/core/src/evaluation/evaluators/agent-judge.ts +++ /dev/null @@ -1,598 +0,0 @@ -import fs from 'node:fs/promises'; -import path from 'node:path'; - -import { generateText, stepCountIs, tool } from 'ai'; -import { z } from 'zod'; - -import { extractLastAssistantContent } from '../providers/types.js'; -import type { Provider } from '../providers/types.js'; -import { TEMPLATE_VARIABLES } from '../template-variables.js'; -import type { JsonObject, RubricItem } from '../types.js'; -import { - buildOutputSchema, - buildRubricOutputSchema, - calculateRubricScore, - freeformEvaluationSchema, - rubricEvaluationSchema, - substituteVariables, -} from './llm-judge.js'; -import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; -import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; - 
-const DEFAULT_MAX_STEPS = 10; -const MAX_STEPS_LIMIT = 50; -const MAX_FILE_SIZE = 50 * 1024; // 50KB -const MAX_SEARCH_MATCHES = 20; - -/** - * Directories/patterns to skip during file search. - */ -const SEARCH_SKIP_DIRS = new Set([ - 'node_modules', - '.git', - '.next', - 'dist', - '__pycache__', - '.cache', -]); - -/** - * Binary file extensions to skip during search. - */ -const BINARY_EXTENSIONS = new Set([ - '.png', - '.jpg', - '.jpeg', - '.gif', - '.ico', - '.svg', - '.woff', - '.woff2', - '.ttf', - '.eot', - '.mp3', - '.mp4', - '.wav', - '.zip', - '.tar', - '.gz', - '.pdf', - '.exe', - '.dll', - '.so', - '.dylib', -]); - -export interface AgentJudgeEvaluatorOptions { - readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - readonly maxSteps?: number; - readonly temperature?: number; - readonly evaluatorTemplate?: string; - readonly judgeTargetProvider?: Provider; -} - -export class AgentJudgeEvaluator implements Evaluator { - readonly kind = 'agent-judge'; - - private readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - private readonly maxSteps: number; - private readonly temperature: number; - private readonly evaluatorTemplate?: string; - private readonly judgeTargetProvider?: Provider; - - constructor(options: AgentJudgeEvaluatorOptions) { - this.resolveJudgeProvider = options.resolveJudgeProvider; - this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); - this.temperature = options.temperature ?? 0; - this.evaluatorTemplate = options.evaluatorTemplate; - this.judgeTargetProvider = options.judgeTargetProvider; - } - - async evaluate(context: EvaluationContext): Promise { - if (this.judgeTargetProvider) { - return this.evaluateWithJudgeTarget(context); - } - return this.evaluateBuiltIn(context); - } - - /** - * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. 
- */ - private async evaluateBuiltIn(context: EvaluationContext): Promise { - const judgeProvider = await this.resolveJudgeProvider(context); - if (!judgeProvider) { - throw new Error('No judge provider available for agent-judge evaluation'); - } - - const model = judgeProvider.asLanguageModel?.(); - if (!model) { - throw new Error( - `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent-judge mode`, - ); - } - - const workspacePath = context.workspacePath; - if (!workspacePath) { - throw new Error( - 'agent-judge evaluator requires a workspace_template target (workspacePath is not set)', - ); - } - - const systemPrompt = this.buildSystemPrompt(context); - const userPrompt = this.buildUserPrompt(context); - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const fsTools = createFilesystemTools(workspacePath); - - const evaluatorRawRequest: JsonObject = { - mode: 'built-in', - systemPrompt, - userPrompt, - target: judgeProvider.targetName, - maxSteps: this.maxSteps, - }; - - try { - const { text, steps } = await generateText({ - model, - system: systemPrompt, - prompt: userPrompt, - tools: fsTools, - stopWhen: stepCountIs(this.maxSteps), - temperature: this.temperature, - }); - - const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0); - - const details: JsonObject = { - mode: 'built-in', - steps: steps.length, - tool_calls: toolCallCount, - }; - - return this.parseResult(text, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge built-in evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'built-in', error: message }, - }; - } - } - - /** - * Judge target mode: Delegates to an external agent provider via Provider.invoke(). - */ - private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { - const provider = this.judgeTargetProvider as Provider; - - const workspacePath = context.workspacePath; - const prompt = this.buildDelegatedPrompt(context); - - const evaluatorRawRequest: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - prompt, - }; - - try { - const response = await provider.invoke({ - question: prompt, - cwd: workspacePath, - evalCaseId: context.evalCase.id, - attempt: context.attempt, - }); - - const assistantContent = extractLastAssistantContent(response.output); - if (!assistantContent) { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['agent-judge judge_target returned no assistant response'], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'judge_target', judge_target: provider.targetName }, - }; - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const details: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - }; - - return this.parseResult(assistantContent, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge judge_target evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { - mode: 'judge_target', - judge_target: provider.targetName, - error: message, - }, - }; - } - } - - /** - * Parse the agent's response text into an EvaluationScore. 
- * Supports both freeform and rubric modes. - */ - private parseResult( - text: string, - rubrics: readonly RubricItem[] | undefined, - evaluatorRawRequest: JsonObject, - details: JsonObject, - ): EvaluationScore { - try { - const parsed = parseJsonFromText(text); - - if (rubrics && rubrics.length > 0) { - const data = rubricEvaluationSchema.parse(parsed); - const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); - return { - score, - verdict, - hits, - misses, - expectedAspectCount: rubrics.length, - reasoning: data.overall_reasoning, - evaluatorRawRequest, - details, - }; - } - - const data = freeformEvaluationSchema.parse(parsed); - const score = clampScore(data.score); - const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; - const misses = Array.isArray(data.misses) - ? data.misses.filter(isNonEmptyString).slice(0, 4) - : []; - - return { - score, - verdict: scoreToVerdict(score), - hits, - misses, - expectedAspectCount: Math.max(hits.length + misses.length, 1), - reasoning: data.reasoning, - evaluatorRawRequest, - details, - }; - } catch { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['Failed to parse agent-judge response as valid evaluation JSON'], - expectedAspectCount: 1, - evaluatorRawRequest, - details, - }; - } - } - - /** - * Build system prompt for built-in mode. - * Includes output format instructions. - */ - private buildSystemPrompt(context: EvaluationContext): string { - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'You are an expert evaluator with access to the workspace filesystem.', - 'Use the provided tools to investigate the workspace and verify the criteria are met.', - 'Thoroughly examine relevant files before making your assessment.', - '', - ]; - - if (rubrics && rubrics.length > 0) { - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } - - /** - * Build user prompt for built-in mode. - * Uses custom template if provided, otherwise builds default prompt. - */ - private buildUserPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - - if (this.evaluatorTemplate) { - return substituteVariables(this.evaluatorTemplate, variables); - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'Evaluate the candidate answer by investigating the workspace.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push( - '', - 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', - ); - } else { - parts.push( - 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', - ); - } - - return parts.join('\n'); - } - - /** - * Build the full evaluation prompt for judge target mode (delegation). - * Combines task context, criteria, candidate info, and output format instructions. - */ - private buildDelegatedPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - if (this.evaluatorTemplate) { - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? 
'').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - const customPrompt = substituteVariables(this.evaluatorTemplate, variables); - - const outputSchema = - rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); - - return `${customPrompt}\n\n${outputSchema}`; - } - - const parts: string[] = [ - 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push(''); - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } -} - -// --------------------------------------------------------------------------- -// Sandboxed filesystem tools for built-in mode -// --------------------------------------------------------------------------- - -/** - * Resolve a relative path within the sandbox, preventing path traversal. - * Returns the absolute path if valid, or throws if the path escapes the sandbox. 
- */ -function resolveSandboxed(basePath: string, relativePath: string): string { - const resolved = path.resolve(basePath, relativePath); - if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { - throw new Error(`Path '${relativePath}' is outside the workspace`); - } - return resolved; -} - -/** - * Create sandboxed filesystem tools for the AI SDK agent loop. - */ -function createFilesystemTools(workspacePath: string) { - return { - list_files: tool({ - description: - 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', - inputSchema: z.object({ - path: z.string().describe('Relative path within workspace (use "." for root)').default('.'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const entries = await fs.readdir(resolved, { withFileTypes: true }); - return entries - .map((e) => ({ - name: e.name, - type: e.isDirectory() ? 'directory' : 'file', - })) - .slice(0, 100); - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - read_file: tool({ - description: - 'Read the content of a file at a relative path within the workspace. 
Large files are truncated at 50KB.', - inputSchema: z.object({ - path: z.string().describe('Relative path to file within workspace'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const stat = await fs.stat(resolved); - if (stat.isDirectory()) { - return { error: `'${input.path}' is a directory, not a file` }; - } - const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); - const fd = await fs.open(resolved, 'r'); - try { - await fd.read(buffer, 0, buffer.length, 0); - } finally { - await fd.close(); - } - const content = buffer.toString('utf-8'); - const truncated = stat.size > MAX_FILE_SIZE; - return { content, truncated, size: stat.size }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - search_files: tool({ - description: - 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', - inputSchema: z.object({ - pattern: z.string().describe('Regex pattern to search for'), - path: z.string().describe('Relative path to search within (use "." for root)').default('.'), - }), - execute: async (input: { pattern: string; path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const regex = new RegExp(input.pattern, 'gi'); - const matches: Array<{ file: string; line: number; text: string }> = []; - - await searchDirectory(resolved, workspacePath, regex, matches); - - return { matches, total: matches.length }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - }; -} - -/** - * Recursively search a directory for regex matches. 
- */ -async function searchDirectory( - dirPath: string, - workspacePath: string, - regex: RegExp, - matches: Array<{ file: string; line: number; text: string }>, -): Promise { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - let entries: import('node:fs').Dirent[]; - try { - entries = await fs.readdir(dirPath, { withFileTypes: true }); - } catch { - return; - } - - for (const entry of entries) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - if (SEARCH_SKIP_DIRS.has(entry.name)) continue; - - const fullPath = path.join(dirPath, entry.name); - - if (entry.isDirectory()) { - await searchDirectory(fullPath, workspacePath, regex, matches); - } else if (entry.isFile()) { - const ext = path.extname(entry.name).toLowerCase(); - if (BINARY_EXTENSIONS.has(ext)) continue; - - try { - const stat = await fs.stat(fullPath); - if (stat.size > MAX_FILE_SIZE) continue; - - const content = await fs.readFile(fullPath, 'utf-8'); - const lines = content.split('\n'); - - for (let i = 0; i < lines.length; i++) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - regex.lastIndex = 0; - if (regex.test(lines[i])) { - matches.push({ - file: path.relative(workspacePath, fullPath), - line: i + 1, - text: lines[i].substring(0, 200), - }); - } - } - } catch { - // Skip unreadable files - } - } - } -} diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 157ff7c99..a64705fbe 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -52,9 +52,6 @@ export { } from './llm-judge.js'; export type { LlmJudgeEvaluatorOptions } from './llm-judge.js'; -export { AgentJudgeEvaluator } from './agent-judge.js'; -export type { AgentJudgeEvaluatorOptions } from './agent-judge.js'; - export { SkillTriggerEvaluator } from './skill-trigger.js'; export { assembleLlmJudgePrompt } from './llm-judge-prompt.js'; diff --git 
a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 46125f3e7..88e6a5268 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -1,14 +1,65 @@ -import { generateText } from 'ai'; +import fs from 'node:fs/promises'; +import path from 'node:path'; + +import { generateText, stepCountIs, tool } from 'ai'; import { z } from 'zod'; import type { Provider, ProviderResponse } from '../providers/types.js'; -import { extractLastAssistantContent } from '../providers/types.js'; +import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { JsonObject, RubricItem } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; +// --------------------------------------------------------------------------- +// Constants for built-in agent mode (filesystem tools) +// --------------------------------------------------------------------------- + +const DEFAULT_MAX_STEPS = 10; +const MAX_STEPS_LIMIT = 50; +const MAX_FILE_SIZE = 50 * 1024; // 50KB +const MAX_SEARCH_MATCHES = 20; + +/** + * Directories/patterns to skip during file search. + */ +const SEARCH_SKIP_DIRS = new Set([ + 'node_modules', + '.git', + '.next', + 'dist', + '__pycache__', + '.cache', +]); + +/** + * Binary file extensions to skip during search. + */ +const BINARY_EXTENSIONS = new Set([ + '.png', + '.jpg', + '.jpeg', + '.gif', + '.ico', + '.svg', + '.woff', + '.woff2', + '.ttf', + '.eot', + '.mp3', + '.mp4', + '.wav', + '.zip', + '.tar', + '.gz', + '.pdf', + '.exe', + '.dll', + '.so', + '.dylib', +]); + /** * Default evaluator template for the user prompt (variables will be substituted). 
* Custom evaluators can override this via evaluatorTemplate option. @@ -38,6 +89,8 @@ export interface LlmJudgeEvaluatorOptions { readonly maxOutputTokens?: number; readonly temperature?: number; readonly evaluatorTemplate?: string; + readonly maxSteps?: number; + readonly judgeTargetProvider?: Provider; } const freeformEvaluationSchema = z.object({ @@ -82,20 +135,40 @@ export class LlmJudgeEvaluator implements Evaluator { private readonly maxOutputTokens?: number; private readonly temperature?: number; private readonly evaluatorTemplate?: string; + private readonly maxSteps: number; + private readonly judgeTargetProvider?: Provider; constructor(options: LlmJudgeEvaluatorOptions) { this.resolveJudgeProvider = options.resolveJudgeProvider; this.maxOutputTokens = options.maxOutputTokens; this.temperature = options.temperature; this.evaluatorTemplate = options.evaluatorTemplate; + this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); + this.judgeTargetProvider = options.judgeTargetProvider; } async evaluate(context: EvaluationContext): Promise { + // Delegate mode: judge target provider is an agent provider — send prompt via invoke() + if (this.judgeTargetProvider) { + return this.evaluateWithJudgeTarget(context); + } + const judgeProvider = await this.resolveJudgeProvider(context); if (!judgeProvider) { throw new Error('No judge provider available for LLM grading'); } + // Built-in agent mode: agentv provider → AI SDK generateText with filesystem tools + if (judgeProvider.kind === 'agentv') { + return this.evaluateBuiltIn(context, judgeProvider); + } + + // Delegate mode: resolved provider is an agent provider → send prompt via invoke() + if (isAgentProvider(judgeProvider)) { + return this.evaluateWithDelegatedAgent(context, judgeProvider); + } + + // LLM mode: structured JSON evaluation const config = context.evaluator; if (config?.type === 'llm-judge' && config.rubrics && config.rubrics.length > 0) { return 
this.evaluateWithRubrics(context, judgeProvider, config.rubrics); @@ -104,6 +177,10 @@ export class LlmJudgeEvaluator implements Evaluator { return this.evaluateFreeform(context, judgeProvider); } + // --------------------------------------------------------------------------- + // LLM mode (existing) + // --------------------------------------------------------------------------- + private async evaluateFreeform( context: EvaluationContext, judgeProvider: Provider, @@ -177,7 +254,7 @@ export class LlmJudgeEvaluator implements Evaluator { tokenUsage, }; } catch (e: unknown) { - // Judge parse failure → skip (not silent zero). + // Judge parse failure -> skip (not silent zero). // Signals infrastructure error to downstream consumers, excluded from score averages. const message = e instanceof Error ? e.message : String(e); const evalName = context.evaluator?.name ?? 'llm-judge'; @@ -314,6 +391,393 @@ export class LlmJudgeEvaluator implements Evaluator { } } + // --------------------------------------------------------------------------- + // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools) + // --------------------------------------------------------------------------- + + /** + * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. 
+ */ + private async evaluateBuiltIn( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + const model = judgeProvider.asLanguageModel?.(); + if (!model) { + throw new Error( + `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent mode`, + ); + } + + const workspacePath = context.workspacePath; + if (!workspacePath) { + throw new Error( + 'llm-judge built-in agent mode requires a workspace_template target (workspacePath is not set)', + ); + } + + const systemPrompt = this.buildAgentSystemPrompt(context); + const userPrompt = this.buildAgentUserPrompt(context); + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const fsTools = createFilesystemTools(workspacePath); + + const evaluatorRawRequest: JsonObject = { + mode: 'built-in', + systemPrompt, + userPrompt, + target: judgeProvider.targetName, + maxSteps: this.maxSteps, + }; + + try { + const { text, steps } = await generateText({ + model, + system: systemPrompt, + prompt: userPrompt, + tools: fsTools, + stopWhen: stepCountIs(this.maxSteps), + temperature: this.temperature ?? 0, + }); + + const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0); + + const details: JsonObject = { + mode: 'built-in', + steps: steps.length, + tool_calls: toolCallCount, + }; + + return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge built-in evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: 'built-in', error: message }, + }; + } + } + + // --------------------------------------------------------------------------- + // Delegate mode (agent provider — send prompt via Provider.invoke()) + // --------------------------------------------------------------------------- + + /** + * Judge target mode: Delegates to an explicit judgeTargetProvider via Provider.invoke(). + */ + private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { + return this.evaluateWithDelegate(context, this.judgeTargetProvider as Provider, 'judge_target'); + } + + /** + * Delegate mode: resolved provider is an agent provider — send prompt via invoke(). + */ + private async evaluateWithDelegatedAgent( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + return this.evaluateWithDelegate(context, judgeProvider, 'delegate'); + } + + /** + * Shared implementation for judge_target and delegate modes. + * Both invoke a provider and parse the agent result from the response. 
+ */ + private async evaluateWithDelegate( + context: EvaluationContext, + provider: Provider, + modeLabel: string, + ): Promise { + const workspacePath = context.workspacePath; + const prompt = this.buildDelegatedPrompt(context); + + const evaluatorRawRequest: JsonObject = { + mode: modeLabel, + judge_target: provider.targetName, + prompt, + }; + + try { + const response = await provider.invoke({ + question: prompt, + cwd: workspacePath, + evalCaseId: context.evalCase.id, + attempt: context.attempt, + }); + + const assistantContent = extractLastAssistantContent(response.output); + if (!assistantContent) { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge ${modeLabel} returned no assistant response`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: modeLabel, judge_target: provider.targetName }, + }; + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const details: JsonObject = { + mode: modeLabel, + judge_target: provider.targetName, + }; + + return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge ${modeLabel} evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { + mode: modeLabel, + judge_target: provider.targetName, + error: message, + }, + }; + } + } + + // --------------------------------------------------------------------------- + // Prompt builders for agent modes + // --------------------------------------------------------------------------- + + /** + * Build system prompt for built-in agent mode. + * Includes output format instructions. + */ + private buildAgentSystemPrompt(context: EvaluationContext): string { + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; + + const parts: string[] = [ + 'You are an expert evaluator with access to the workspace filesystem.', + 'Use the provided tools to investigate the workspace and verify the criteria are met.', + 'Thoroughly examine relevant files before making your assessment.', + '', + ]; + + if (rubrics && rubrics.length > 0) { + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + /** + * Build user prompt for built-in agent mode. + * Uses custom template if provided, otherwise builds default prompt. + */ + private buildAgentUserPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + }; + + if (this.evaluatorTemplate) { + return substituteVariables(this.evaluatorTemplate, variables); + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; + + const parts: string[] = [ + 'Evaluate the candidate answer by investigating the workspace.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push( + '', + 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', + ); + } else { + parts.push( + 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', + ); + } + + return parts.join('\n'); + } + + /** + * Build the full evaluation prompt for delegate mode (agent providers). + * Combines task context, criteria, candidate info, and output format instructions. + */ + private buildDelegatedPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + if (this.evaluatorTemplate) { + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? 
'').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + }; + const customPrompt = substituteVariables(this.evaluatorTemplate, variables); + + const outputSchema = + rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); + + return `${customPrompt}\n\n${outputSchema}`; + } + + const parts: string[] = [ + 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push(''); + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + // --------------------------------------------------------------------------- + // Agent result parser (shared by built-in and delegate modes) + // --------------------------------------------------------------------------- + + /** + * Parse the agent's response text into an EvaluationScore. + * Supports both freeform and rubric modes. 
+ */ + private parseAgentResult( + text: string, + rubrics: readonly RubricItem[] | undefined, + evaluatorRawRequest: JsonObject, + details: JsonObject, + ): EvaluationScore { + try { + const parsed = parseJsonFromText(text); + + if (rubrics && rubrics.length > 0) { + const data = rubricEvaluationSchema.parse(parsed); + const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); + return { + score, + verdict, + hits, + misses, + expectedAspectCount: rubrics.length, + reasoning: data.overall_reasoning, + evaluatorRawRequest, + details, + }; + } + + const data = freeformEvaluationSchema.parse(parsed); + const score = clampScore(data.score); + const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; + const misses = Array.isArray(data.misses) + ? data.misses.filter(isNonEmptyString).slice(0, 4) + : []; + + return { + score, + verdict: scoreToVerdict(score), + hits, + misses, + expectedAspectCount: Math.max(hits.length + misses.length, 1), + reasoning: data.reasoning, + evaluatorRawRequest, + details, + }; + } catch { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: ['Failed to parse llm-judge agent response as valid evaluation JSON'], + expectedAspectCount: 1, + evaluatorRawRequest, + details, + }; + } + } + + // --------------------------------------------------------------------------- + // LLM mode prompt builders + // --------------------------------------------------------------------------- + /** * Build prompt for score-range rubric evaluation. 
*/ @@ -421,6 +885,10 @@ export class LlmJudgeEvaluator implements Evaluator { return parts.join('\n'); } + // --------------------------------------------------------------------------- + // LLM mode retry logic + // --------------------------------------------------------------------------- + private async runWithRetry(options: { readonly context: EvaluationContext; readonly judgeProvider: Provider; @@ -474,6 +942,10 @@ export class LlmJudgeEvaluator implements Evaluator { } } +// --------------------------------------------------------------------------- +// Output schema builders (exported for reuse) +// --------------------------------------------------------------------------- + /** * Build the mandatory output schema that all evaluators must follow. * This schema is always appended to the evaluator template. @@ -656,3 +1128,162 @@ function calculateScoreRangeResult( }, }; } + +// --------------------------------------------------------------------------- +// Sandboxed filesystem tools for built-in agent mode +// --------------------------------------------------------------------------- + +/** + * Resolve a relative path within the sandbox, preventing path traversal. + * Returns the absolute path if valid, or throws if the path escapes the sandbox. + */ +function resolveSandboxed(basePath: string, relativePath: string): string { + const resolved = path.resolve(basePath, relativePath); + if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { + throw new Error(`Path '${relativePath}' is outside the workspace`); + } + return resolved; +} + +/** + * Create sandboxed filesystem tools for the AI SDK agent loop. + */ +function createFilesystemTools(workspacePath: string) { + return { + list_files: tool({ + description: + 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', + inputSchema: z.object({ + path: z.string().describe('Relative path within workspace (use "." 
for root)').default('.'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const entries = await fs.readdir(resolved, { withFileTypes: true }); + return entries + .map((e) => ({ + name: e.name, + type: e.isDirectory() ? 'directory' : 'file', + })) + .slice(0, 100); + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + read_file: tool({ + description: + 'Read the content of a file at a relative path within the workspace. Large files are truncated at 50KB.', + inputSchema: z.object({ + path: z.string().describe('Relative path to file within workspace'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const stat = await fs.stat(resolved); + if (stat.isDirectory()) { + return { error: `'${input.path}' is a directory, not a file` }; + } + const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); + const fd = await fs.open(resolved, 'r'); + try { + await fd.read(buffer, 0, buffer.length, 0); + } finally { + await fd.close(); + } + const content = buffer.toString('utf-8'); + const truncated = stat.size > MAX_FILE_SIZE; + return { content, truncated, size: stat.size }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + search_files: tool({ + description: + 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', + inputSchema: z.object({ + pattern: z.string().describe('Regex pattern to search for'), + path: z.string().describe('Relative path to search within (use "." 
for root)').default('.'), + }), + execute: async (input: { pattern: string; path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + let regex: RegExp; + try { + regex = new RegExp(input.pattern, 'gi'); + } catch (regexErr) { + return { + error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`, + }; + } + const matches: Array<{ file: string; line: number; text: string }> = []; + + await searchDirectory(resolved, workspacePath, regex, matches); + + return { matches, total: matches.length }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + }; +} + +/** + * Recursively search a directory for regex matches. + */ +async function searchDirectory( + dirPath: string, + workspacePath: string, + regex: RegExp, + matches: Array<{ file: string; line: number; text: string }>, +): Promise { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + let entries: import('node:fs').Dirent[]; + try { + entries = await fs.readdir(dirPath, { withFileTypes: true }); + } catch { + return; + } + + for (const entry of entries) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + if (SEARCH_SKIP_DIRS.has(entry.name)) continue; + + const fullPath = path.join(dirPath, entry.name); + + if (entry.isDirectory()) { + await searchDirectory(fullPath, workspacePath, regex, matches); + } else if (entry.isFile()) { + const ext = path.extname(entry.name).toLowerCase(); + if (BINARY_EXTENSIONS.has(ext)) continue; + + try { + const stat = await fs.stat(fullPath); + if (stat.size > MAX_FILE_SIZE) continue; + + const content = await fs.readFile(fullPath, 'utf-8'); + const lines = content.split('\n'); + + for (let i = 0; i < lines.length; i++) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + regex.lastIndex = 0; + if (regex.test(lines[i])) { + matches.push({ + file: path.relative(workspacePath, fullPath), + line: i + 1, + text: lines[i].substring(0, 200), 
+ }); + } + } + } catch { + // Skip unreadable files + } + } + } +} diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index 4d9560157..9c79366a0 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -143,11 +143,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { return `Output ends with '${entry.value}'`; case 'llm-judge': - case 'llm_judge': - return typeof entry.prompt === 'string' ? entry.prompt : null; - - case 'agent-judge': - case 'agent_judge': { + case 'llm_judge': { // Expand each rubric item to its own assertion string // Return the first one — callers handle arrays via assertionToNaturalLanguageList if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { @@ -217,10 +213,10 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { /** * Expand a single assertion entry into zero or more NL strings. - * Most assertions produce exactly one string; agent-judge with rubrics expands to many. + * Most assertions produce exactly one string; llm-judge with rubrics expands to many. */ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { - if (entry.type === 'agent-judge' || entry.type === 'agent_judge') { + if (entry.type === 'llm-judge' || entry.type === 'llm_judge') { if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) .map((r) => r.outcome ?? r.criteria ?? 
r.id) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2eb72cb92..4ec619e22 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -852,82 +852,6 @@ async function parseEvaluatorList( continue; } - if (typeValue === 'agent-judge') { - // Validate max_steps (1-50) - const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; - let maxSteps: number | undefined; - if (rawMaxSteps !== undefined) { - if ( - typeof rawMaxSteps !== 'number' || - !Number.isInteger(rawMaxSteps) || - rawMaxSteps < 1 || - rawMaxSteps > 50 - ) { - logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`, - ); - continue; - } - maxSteps = rawMaxSteps; - } - - // Validate temperature (0-2) - const rawTemperature = rawEvaluator.temperature; - let temperature: number | undefined; - if (rawTemperature !== undefined) { - if (typeof rawTemperature !== 'number' || rawTemperature < 0 || rawTemperature > 2) { - logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`, - ); - continue; - } - temperature = rawTemperature; - } - - // Validate target (string) - const judgeTarget = asString(rawEvaluator.target); - - // Parse prompt (file path or inline text) - let agentPrompt: string | undefined; - let agentPromptPath: string | undefined; - const rawAgentPrompt = rawEvaluator.prompt; - if (typeof rawAgentPrompt === 'string') { - agentPrompt = rawAgentPrompt; - const resolved = await resolveFileReference(rawAgentPrompt, searchRoots); - if (resolved.resolvedPath) { - agentPromptPath = path.resolve(resolved.resolvedPath); - } - } - - // Parse rubrics via existing infrastructure - const rawAgentRubrics = rawEvaluator.rubrics; - const agentParsedRubrics = Array.isArray(rawAgentRubrics) - ? 
parseRubricItems(rawAgentRubrics, name, evalId) - : undefined; - - const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); - - evaluators.push({ - name, - type: 'agent-judge', - ...(agentPrompt ? { prompt: agentPrompt } : {}), - ...(agentPromptPath - ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } - : {}), - ...(agentParsedRubrics && agentParsedRubrics.length > 0 - ? { rubrics: agentParsedRubrics } - : {}), - ...(maxSteps !== undefined ? { max_steps: maxSteps } : {}), - ...(temperature !== undefined ? { temperature } : {}), - ...(judgeTarget ? { target: judgeTarget } : {}), - ...(weight !== undefined ? { weight } : {}), - ...(required !== undefined ? { required } : {}), - ...(negate !== undefined ? { negate } : {}), - }); - continue; - } - if (typeValue === 'skill-trigger') { const skillName = asString(rawEvaluator.skill); if (!skillName) { @@ -1266,6 +1190,9 @@ async function parseEvaluatorList( 'config', 'required', 'negate', + 'max_steps', + 'maxSteps', + 'temperature', ]); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { @@ -1284,6 +1211,19 @@ async function parseEvaluatorList( const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : undefined); + // Parse optional max_steps and temperature (used in agent mode) + const rawMaxStepsLlm = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; + const llmMaxSteps = + typeof rawMaxStepsLlm === 'number' && + Number.isInteger(rawMaxStepsLlm) && + rawMaxStepsLlm >= 1 && + rawMaxStepsLlm <= 50 + ? rawMaxStepsLlm + : undefined; + const rawTempLlm = rawEvaluator.temperature; + const llmTemperature = + typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 ? rawTempLlm : undefined; + evaluators.push({ name, type: 'llm-judge', @@ -1297,6 +1237,8 @@ async function parseEvaluatorList( ...(required !== undefined ? 
{ required } : {}), ...(negate !== undefined ? { negate } : {}), ...(finalConfig ? { config: finalConfig } : {}), + ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), + ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}), }); } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 396bc15fe..95cbdab7f 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -235,6 +235,10 @@ export interface RunEvaluationOptions { readonly retainOnSuccess?: 'keep' | 'cleanup'; /** Retention policy override for failed cases */ readonly retainOnFailure?: 'keep' | 'cleanup'; + /** CLI override: judge target name (e.g., "agentv" or a target from targets.yaml) */ + readonly judgeTarget?: string; + /** CLI override: model for judge target (e.g., "openai:gpt-5-mini") */ + readonly model?: string; } export async function runEvaluation( @@ -271,6 +275,8 @@ export async function runEvaluation( workspaceClean, retainOnSuccess, retainOnFailure, + judgeTarget: cliJudgeTarget, + model: cliModel, } = options; // Disable cache when trials > 1 (cache makes trials deterministic = pointless) @@ -335,6 +341,25 @@ export async function runEvaluation( const resolveJudgeProvider = async ( targetContext: ResolvedTarget, ): Promise => { + // CLI --judge-target takes highest priority + if (cliJudgeTarget) { + if (cliJudgeTarget === 'agentv') { + if (!cliModel) { + throw new Error('--judge-target "agentv" requires --model (e.g., "openai:gpt-5-mini")'); + } + const { AgentvProvider } = await import('./providers/agentv-provider.js'); + return new AgentvProvider('agentv', { model: cliModel, temperature: 0 }); + } + const overrideTarget = resolveTargetByName(cliJudgeTarget); + if (!overrideTarget) { + throw new Error(`--judge-target "${cliJudgeTarget}" not found in targets`); + } + return getOrCreateProvider(overrideTarget); + } + + // TODO: When --model is provided 
without --judge-target, override the model of + // whichever judge target is resolved. For now, --model only works with --judge-target agentv. + const judgeName = targetContext.judgeTarget ?? targetContext.name; const resolvedJudge = resolveTargetByName(judgeName); if (!resolvedJudge) { @@ -346,7 +371,8 @@ export async function runEvaluation( // Validate judge_target: error if an agent provider would be used as judge. // Agent providers can't return structured JSON for judging — they respond with // tool calls and markdown, causing silent score-0 failures. - if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) { + // CLI --judge-target override also satisfies this requirement. + if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget && !cliJudgeTarget) { throw new Error( `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target — agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-llm).`, ); diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts new file mode 100644 index 000000000..88084c8fa --- /dev/null +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -0,0 +1,89 @@ +import { createAnthropic } from '@ai-sdk/anthropic'; +import { createAzure } from '@ai-sdk/azure'; +import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import { createOpenAI } from '@ai-sdk/openai'; +import type { LanguageModel } from 'ai'; + +import type { AgentVResolvedConfig } from './targets.js'; +import type { Provider, ProviderRequest, ProviderResponse } from './types.js'; + +/** + * Parse a model string like "openai:gpt-5-mini" into provider prefix and model name. + */ +function parseModelString(model: string): { provider: string; modelName: string } { + const colonIndex = model.indexOf(':'); + if (colonIndex === -1) { + throw new Error( + `Invalid model string "${model}". 
Expected format "provider:model" (e.g., "openai:gpt-5-mini")`, + ); + } + return { + provider: model.slice(0, colonIndex), + modelName: model.slice(colonIndex + 1), + }; +} + +/** + * Create a LanguageModel from a model string using the appropriate AI SDK provider. + */ +function createLanguageModel(modelString: string): LanguageModel { + const { provider, modelName } = parseModelString(modelString); + + switch (provider) { + case 'openai': + // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the + // codebase uses LanguageModelV2. The runtime API is compatible. + return createOpenAI()(modelName) as unknown as LanguageModel; + case 'anthropic': + return createAnthropic()(modelName); + case 'azure': + return createAzure()(modelName); + case 'google': + return createGoogleGenerativeAI()(modelName); + default: + throw new Error( + `Unsupported AI SDK provider "${provider}" in model string "${modelString}". Supported providers: openai, anthropic, azure, google`, + ); + } +} + +/** + * AgentV built-in provider for LLM judge evaluation. + * + * Resolves an AI SDK model string (e.g., "openai:gpt-5-mini", "anthropic:claude-sonnet-4-20250514") + * to a Vercel AI SDK LanguageModel by parsing the provider prefix and creating the appropriate + * AI SDK provider directly. This provider is used exclusively for judge evaluation — it does not + * support direct agent invocation. + * + * Usage: `--judge-target agentv --model openai:gpt-5-mini` + */ +export class AgentvProvider implements Provider { + readonly id: string; + readonly kind = 'agentv' as const; + readonly targetName: string; + + private readonly model: LanguageModel; + + constructor(targetName: string, config: AgentVResolvedConfig) { + this.id = `agentv:${targetName}`; + this.targetName = targetName; + this.model = createLanguageModel(config.model); + } + + /** + * Direct invoke is not supported for the agentv provider. + * Use asLanguageModel() with generateText() instead. 
+ */ + async invoke(_request: ProviderRequest): Promise { + throw new Error( + 'AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead.', + ); + } + + /** + * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject. + */ + asLanguageModel(): LanguageModel { + return this.model; + } +} diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 62cd8eef8..6ec6e2dfa 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -1,3 +1,4 @@ +import { AgentvProvider } from './agentv-provider.js'; import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; import { ClaudeCliProvider } from './claude-cli.js'; import { ClaudeSdkProvider } from './claude-sdk.js'; @@ -30,6 +31,7 @@ export type { } from './types.js'; export type { + AgentVResolvedConfig, AnthropicResolvedConfig, AzureResolvedConfig, ClaudeResolvedConfig, @@ -95,6 +97,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry { // claude-sdk is the explicit SDK provider (requires @anthropic-ai/claude-agent-sdk) .register('claude-sdk', (t) => new ClaudeSdkProvider(t.name, t.config as never)) .register('mock', (t) => new MockProvider(t.name, t.config as never)) + .register('agentv', (t) => new AgentvProvider(t.name, t.config as never)) .register('vscode', (t) => new VSCodeProvider(t.name, t.config as never, 'vscode')) .register( 'vscode-insiders', diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index aa30b06b6..26f827eae 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -514,6 +514,11 @@ export interface VSCodeResolvedConfig { readonly timeoutMs?: number; } +export interface AgentVResolvedConfig { + readonly model: string; + readonly temperature: number; +} + /** 
* Healthcheck configuration type derived from CliHealthcheckSchema. * Supports both HTTP and command-based healthchecks. @@ -628,6 +633,14 @@ export type ResolvedTarget = readonly providerBatching?: boolean; readonly config: VSCodeResolvedConfig; } + | { + readonly kind: 'agentv'; + readonly name: string; + readonly judgeTarget?: string; + readonly workers?: number; + readonly providerBatching?: boolean; + readonly config: AgentVResolvedConfig; + } | { readonly kind: 'cli'; readonly name: string; @@ -841,6 +854,23 @@ export function resolveTargetDefinition( providerBatching, config: resolveVSCodeConfig(parsed, env, provider === 'vscode-insiders', evalFilePath), }; + case 'agentv': { + const model = typeof parsed.model === 'string' ? parsed.model : undefined; + if (!model) { + throw new Error( + `Target "${parsed.name}" (provider: agentv) requires a "model" field (e.g., "openai:gpt-5-mini")`, + ); + } + const temperature = typeof parsed.temperature === 'number' ? parsed.temperature : 0; + return { + kind: 'agentv', + name: parsed.name, + judgeTarget: parsed.judge_target, + workers: typeof parsed.workers === 'number' ? parsed.workers : undefined, + providerBatching, + config: { model, temperature }, + }; + } case 'cli': return { kind: 'cli', diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index af5e3b6a1..e0106071a 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -25,7 +25,8 @@ export type ProviderKind = | 'cli' | 'mock' | 'vscode' - | 'vscode-insiders'; + | 'vscode-insiders' + | 'agentv'; /** * Agent providers that have filesystem access and don't need unwrapped guidelines. 
@@ -63,6 +64,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'mock', 'vscode', 'vscode-insiders', + 'agentv', ] as const; /** diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index dee6b0237..7d8e6ff88 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -6,9 +6,7 @@ * the EvaluatorRegistry at startup. */ -import { readFileSync } from 'node:fs'; import { - AgentJudgeEvaluator, CodeEvaluator, CompositeEvaluator, CostEvaluator, @@ -34,10 +32,10 @@ import { } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; +import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; import type { - AgentJudgeEvaluatorConfig, CodeEvaluatorConfig, CompositeEvaluatorConfig, ContainsAllEvaluatorConfig, @@ -74,6 +72,11 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); * Factory for `llm-judge` evaluators. * Creates a wrapper that resolves custom prompts at evaluation time and * optionally overrides the judge target per evaluator. 
+ * + * Auto-detects mode based on the resolved judge provider: + * - LLM providers (azure, anthropic, gemini): structured JSON mode + * - Agent providers (claude-cli, copilot, etc.): delegate mode + * - agentv provider: built-in AI SDK agent mode with filesystem tools */ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { const c = config as LlmJudgeEvaluatorConfig; @@ -88,12 +91,20 @@ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { if (!judgeTargetProvider) { throw new Error(`llm-judge evaluator '${c.name}': target '${c.target}' not found in targets`); } + // Only pass judgeTargetProvider for agent providers (delegate mode). + // LLM providers use the normal resolveJudgeProvider path for structured JSON mode. + // Note: agentv uses asLanguageModel() not invoke(), so it's not in AGENT_PROVIDER_KINDS; + // check it explicitly here for built-in agent mode. + const isAgent = isAgentProvider(judgeTargetProvider) || judgeTargetProvider.kind === 'agentv'; evaluator = new LlmJudgeEvaluator({ resolveJudgeProvider: async (evalContext) => { if (judgeTargetProvider) return judgeTargetProvider; if (evalContext.judgeProvider) return evalContext.judgeProvider; return judgeProvider; }, + maxSteps: c.max_steps, + temperature: c.temperature, + ...(isAgent ? { judgeTargetProvider } : {}), }); } @@ -198,45 +209,6 @@ export const executionMetricsFactory: EvaluatorFactoryFn = (config) => { }); }; -/** Factory for `agent-judge` evaluators. */ -export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => { - const c = config as AgentJudgeEvaluatorConfig; - const { judgeProvider, targetResolver } = context; - - let customPrompt: string | undefined; - if (c.resolvedPromptPath) { - try { - customPrompt = readFileSync(c.resolvedPromptPath, 'utf-8'); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`); - } - } else if (c.prompt) { - customPrompt = c.prompt; - } - - let judgeTargetProvider: Provider | undefined; - if (c.target && targetResolver) { - judgeTargetProvider = targetResolver(c.target); - if (!judgeTargetProvider) { - throw new Error( - `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`, - ); - } - } - - return new AgentJudgeEvaluator({ - resolveJudgeProvider: async (ctx) => { - if (ctx.judgeProvider) return ctx.judgeProvider; - return judgeProvider; - }, - maxSteps: c.max_steps, - temperature: c.temperature, - evaluatorTemplate: customPrompt, - judgeTargetProvider, - }); -}; - /** Factory for `skill-trigger` evaluator. */ export const skillTriggerFactory: EvaluatorFactoryFn = (config) => { return new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig); @@ -440,7 +412,6 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('cost', costFactory) .register('token-usage', tokenUsageFactory) .register('execution-metrics', executionMetricsFactory) - .register('agent-judge', agentJudgeFactory) .register('skill-trigger', skillTriggerFactory) .register('contains', containsFactory) .register('contains-any', containsAnyFactory) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index b69c272ab..b174af42f 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -158,7 +158,6 @@ const EVALUATOR_KIND_VALUES = [ 'cost', 'token-usage', 'execution-metrics', - 'agent-judge', 'skill-trigger', 'contains', 'contains-any', @@ -337,6 +336,10 @@ export type LlmJudgeEvaluatorConfig = { readonly target?: string; /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */ readonly config?: Record; + /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. 
*/ + readonly max_steps?: number; + /** Temperature override for judge calls */ + readonly temperature?: number; }; /** @@ -529,35 +532,6 @@ export type ExecutionMetricsEvaluatorConfig = { readonly negate?: boolean; }; -/** - * Configuration for the agent-judge evaluator. - * Runs an agentic investigation loop to audit workspaces and verify criteria. - * Two modes: - * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools - * - Judge target: Delegates to an external agent provider via Provider.invoke() - */ -export type AgentJudgeEvaluatorConfig = { - readonly name: string; - readonly type: 'agent-judge'; - /** Custom evaluation prompt (inline text or file path) */ - readonly prompt?: string; - readonly promptPath?: string; - /** Resolved absolute path for prompt file */ - readonly resolvedPromptPath?: string; - /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */ - readonly rubrics?: readonly RubricItem[]; - /** Maximum agent steps for built-in mode (default 10, max 50) */ - readonly max_steps?: number; - /** Temperature for built-in mode (default 0) */ - readonly temperature?: number; - /** Target name — delegates agent loop to this provider instead of built-in mode */ - readonly target?: string; - readonly weight?: number; - readonly required?: boolean | number; - /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ - readonly negate?: boolean; -}; - /** * Configuration for the contains assertion evaluator. * Checks whether the candidate output contains a specified substring. 
@@ -766,7 +740,6 @@ export type EvaluatorConfig = | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig - | AgentJudgeEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 690373b43..e3bad5fed 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -87,6 +87,8 @@ const LlmJudgeSchema = EvaluatorCommonSchema.extend({ model: z.string().optional(), target: z.string().optional(), config: z.record(z.unknown()).optional(), + max_steps: z.number().int().min(1).max(50).optional(), + temperature: z.number().min(0).max(2).optional(), }); /** Aggregator configs for composite evaluator */ @@ -189,15 +191,6 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.enum(['agent-judge', 'agent_judge']), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); - const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), value: z.string(), @@ -233,7 +226,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts index de224a1a2..0647ce387 100644 --- a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -244,7 +244,7 @@ 
describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('The answer is clear and concise'); }); - it('converts agent-judge with rubrics to multiple assertions', () => { + it('converts llm-judge with rubrics to multiple assertions (rubrics variant)', () => { const suite = { tests: [ { @@ -253,7 +253,7 @@ describe('transpileEvalYaml — NL assertions', () => { assertions: [ { type: 'skill-trigger', skill: 's', should_trigger: true }, { - type: 'agent-judge', + type: 'llm-judge', rubrics: [ { id: 'r1', outcome: 'Correct result returned' }, { id: 'r2', outcome: 'No unnecessary steps' }, @@ -269,6 +269,31 @@ describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('No unnecessary steps'); }); + it('converts llm-judge with rubrics to multiple assertions', () => { + const suite = { + tests: [ + { + id: 't1', + input: 'test', + assertions: [ + { type: 'skill-trigger', skill: 's', should_trigger: true }, + { + type: 'llm-judge', + rubrics: [ + { id: 'r1', outcome: 'Response is accurate' }, + { id: 'r2', outcome: 'Formatting is correct' }, + ], + }, + ], + }, + ], + }; + const { files } = transpileEvalYaml(suite); + const evals = files.get('s')?.evals; + expect(evals[0].assertions).toContain('Response is accurate'); + expect(evals[0].assertions).toContain('Formatting is correct'); + }); + it('converts tool-trajectory to NL', () => { const suite = { tests: [ diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts new file mode 100644 index 000000000..2b0c0aadd --- /dev/null +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -0,0 +1,133 @@ +import { describe, expect, it, vi } from 'vitest'; + +// Mock AI SDK provider packages before importing the provider. +// Each createXxx() returns a callable factory: createXxx()(modelName) => model stub. 
+vi.mock('@ai-sdk/openai', () => ({ + createOpenAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'openai', + }), +})); + +vi.mock('@ai-sdk/anthropic', () => ({ + createAnthropic: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'anthropic', + }), +})); + +vi.mock('@ai-sdk/azure', () => ({ + createAzure: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'azure', + }), +})); + +vi.mock('@ai-sdk/google', () => ({ + createGoogleGenerativeAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'google', + }), +})); + +import { AgentvProvider } from '../../../src/evaluation/providers/agentv-provider.js'; + +describe('AgentvProvider', () => { + it('has kind "agentv"', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.kind).toBe('agentv'); + }); + + it('has correct targetName', () => { + const provider = new AgentvProvider('my-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.targetName).toBe('my-judge'); + }); + + it('has correct id format', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.id).toBe('agentv:test-judge'); + }); + + it('asLanguageModel() returns a defined LanguageModel', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-5-mini'); + }); + + it('asLanguageModel() works with anthropic model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'anthropic:claude-sonnet-4-20250514', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as 
unknown as { modelId: string }).modelId).toBe('claude-sonnet-4-20250514'); + }); + + it('asLanguageModel() works with google model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'google:gemini-2.5-flash', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gemini-2.5-flash'); + }); + + it('asLanguageModel() works with azure model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'azure:gpt-4o-deployment', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-4o-deployment'); + }); + + it('throws for unsupported provider prefix', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'unsupported:some-model', + temperature: 0, + }), + ).toThrow('Unsupported AI SDK provider "unsupported"'); + }); + + it('throws for model string without colon separator', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'gpt-5-mini', + temperature: 0, + }), + ).toThrow('Invalid model string "gpt-5-mini"'); + }); + + it('invoke() throws an error', async () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + await expect(provider.invoke({ question: 'test' })).rejects.toThrow( + 'AgentvProvider does not support direct invoke()', + ); + }); +}); diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index eacd573b2..7c7d2b0c2 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -559,6 +559,57 @@ describe('resolveTargetDefinition', () => { ), ).toThrow(/workspace_template has been removed/i); }); + + it('resolves agentv target with model and default 
temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-judge', + provider: 'agentv', + model: 'openai:gpt-5-mini', + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('openai:gpt-5-mini'); + expect(target.config.temperature).toBe(0); + }); + + it('resolves agentv target with explicit temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-warm', + provider: 'agentv', + model: 'anthropic:claude-haiku-4.5', + temperature: 0.7, + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('anthropic:claude-haiku-4.5'); + expect(target.config.temperature).toBe(0.7); + }); + + it('throws when agentv target is missing model', () => { + expect(() => + resolveTargetDefinition( + { + name: 'agentv-no-model', + provider: 'agentv', + }, + {}, + ), + ).toThrow(/model/i); + }); }); describe('createProvider', () => { diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index dd28ea304..bb77b4710 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -47,7 +47,6 @@ export type AssertionType = | 'cost' | 'token-usage' | 'execution-metrics' - | 'agent-judge' | 'skill-trigger' | 'contains' | 'contains-any' @@ -67,7 +66,6 @@ export type AssertionType = | 'field_accuracy' | 'token_usage' | 'execution_metrics' - | 'agent_judge' | 'contains_any' | 'contains_all' | 'icontains_any' diff --git a/plugins/agentv-dev/agents/eval-analyzer.md b/plugins/agentv-dev/agents/eval-analyzer.md index 547c86267..31660128e 100644 --- a/plugins/agentv-dev/agents/eval-analyzer.md +++ b/plugins/agentv-dev/agents/eval-analyzer.md @@ -28,7 +28,7 @@ If `eval-path` is provided, also read the EVAL.yaml to understand evaluator conf ### Step 2: Deterministic-Upgrade 
Analysis -For each evaluator entry in `scores` where `type` is `"llm-judge"`, `"rubrics"`, or `"agent-judge"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: +For each evaluator entry in `scores` where `type` is `"llm-judge"` or `"rubrics"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: | Signal | Detection | Suggested Upgrade | |--------|-----------|-------------------| @@ -123,7 +123,7 @@ If a section has no findings, include the header with "None found." underneath. - **Be specific:** Every suggestion must include the test case ID, evaluator name, evidence from the results, and a concrete replacement config. - **Be conservative:** Only suggest deterministic upgrades when the pattern is clear and consistent. Partial or ambiguous evidence should be noted but not acted on. - **Prioritize by impact:** Order suggestions by estimated cost savings (LLM-judge → deterministic saves the most). -- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `agent-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. +- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. - **Multi-provider awareness:** When results span multiple targets, note if a suggestion applies to all targets or is target-specific. - **No false positives:** It is better to miss a suggestion than to recommend an incorrect upgrade. If unsure, add the finding to a "Needs Review" subsection with your reasoning. 
diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 95d7bf796..5ae6275a3 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -222,7 +222,7 @@ tests: |----------|-------------|----------| | `criteria` + **no `assertions`** | Implicit `llm-judge` runs automatically against `criteria` | No | | `criteria` + **`assertions` with only deterministic evaluators** (contains, regex, etc.) | Only declared evaluators run. `criteria` is **not evaluated**. | Yes — warns that no evaluator will consume criteria | -| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, agent-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | +| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | ### No assertions → implicit llm-judge diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 9093c7e48..483031bf6 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -421,6 +421,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -930,112 +940,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - 
"type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -1495,6 +1399,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -2004,112 +1918,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - 
"type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -2569,6 +2377,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -3105,125 +2923,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": 
[ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -3655,6 +3367,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -4164,112 +3886,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": 
["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -4729,6 +4345,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -5265,125 +4891,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - 
"properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -5803,6 +5323,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -6312,112 +5842,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] 
- }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -7292,6 +6716,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -7828,125 +7262,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 
- }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -8366,6 +7694,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -8875,112 +8213,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": 
"boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -9440,6 +8672,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -9976,125 +9218,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": 
"boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -10526,6 +9662,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -11035,112 +10181,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - 
"weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -11600,6 +10640,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -12136,125 +11186,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - 
"outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -12674,6 +11618,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -13183,112 +12137,6 @@ "required": ["type"], 
"additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -14067,6 +12915,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -14603,125 +13461,19 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - 
"type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": ["type", "value"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": ["type", "value"], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -15141,6 +13893,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + 
"maximum": 2 } }, "required": ["type"], @@ -15650,112 +14412,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -16215,6 +14871,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -16696,135 +15362,29 @@ "minimum": 0 }, "max_llm_calls": { - "type": "number", - 
"minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": ["type"], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } + "type": "number", + "minimum": 0 }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 + "max_tokens": { + "type": "number", + "minimum": 0 }, - "temperature": { + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { 
"type": "number", "minimum": 0, - "maximum": 2 + "maximum": 1 }, - "target": { - "type": "string" + "exploration_tolerance": { + "type": "number", + "minimum": 0 } }, "required": ["type"], @@ -17335,6 +15895,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -17844,112 +16414,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - 
"additionalProperties": false - }, { "type": "object", "properties": { @@ -18409,6 +16873,16 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, "required": ["type"], @@ -18918,112 +17392,6 @@ "required": ["type"], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": ["agent-judge", "agent_judge"] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": ["score_range", "outcome"], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": ["type"], - "additionalProperties": false - }, { "type": "object", "properties": {