From 2cb3a005e880cc1c6a8b443ec31ea2c95863a3a5 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 12:39:08 +0000 Subject: [PATCH 01/13] feat: add agentv to ProviderKind Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/providers/types.ts | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index af5e3b6a1..e0106071a 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -25,7 +25,8 @@ export type ProviderKind = | 'cli' | 'mock' | 'vscode' - | 'vscode-insiders'; + | 'vscode-insiders' + | 'agentv'; /** * Agent providers that have filesystem access and don't need unwrapped guidelines. @@ -63,6 +64,7 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'mock', 'vscode', 'vscode-insiders', + 'agentv', ] as const; /** From 80a20c1ea250c66d740ceb36e1429dc86ace1c12 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 12:42:43 +0000 Subject: [PATCH 02/13] feat: add agentv provider to target resolution Co-Authored-By: Claude Opus 4.6 --- .../core/src/evaluation/providers/targets.ts | 30 +++++++++++ .../test/evaluation/providers/targets.test.ts | 51 +++++++++++++++++++ 2 files changed, 81 insertions(+) diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index aa30b06b6..26f827eae 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -514,6 +514,11 @@ export interface VSCodeResolvedConfig { readonly timeoutMs?: number; } +export interface AgentVResolvedConfig { + readonly model: string; + readonly temperature: number; +} + /** * Healthcheck configuration type derived from CliHealthcheckSchema. * Supports both HTTP and command-based healthchecks. 
@@ -628,6 +633,14 @@ export type ResolvedTarget = readonly providerBatching?: boolean; readonly config: VSCodeResolvedConfig; } + | { + readonly kind: 'agentv'; + readonly name: string; + readonly judgeTarget?: string; + readonly workers?: number; + readonly providerBatching?: boolean; + readonly config: AgentVResolvedConfig; + } | { readonly kind: 'cli'; readonly name: string; @@ -841,6 +854,23 @@ export function resolveTargetDefinition( providerBatching, config: resolveVSCodeConfig(parsed, env, provider === 'vscode-insiders', evalFilePath), }; + case 'agentv': { + const model = typeof parsed.model === 'string' ? parsed.model : undefined; + if (!model) { + throw new Error( + `Target "${parsed.name}" (provider: agentv) requires a "model" field (e.g., "openai:gpt-5-mini")`, + ); + } + const temperature = typeof parsed.temperature === 'number' ? parsed.temperature : 0; + return { + kind: 'agentv', + name: parsed.name, + judgeTarget: parsed.judge_target, + workers: typeof parsed.workers === 'number' ? 
parsed.workers : undefined, + providerBatching, + config: { model, temperature }, + }; + } case 'cli': return { kind: 'cli', diff --git a/packages/core/test/evaluation/providers/targets.test.ts b/packages/core/test/evaluation/providers/targets.test.ts index eacd573b2..7c7d2b0c2 100644 --- a/packages/core/test/evaluation/providers/targets.test.ts +++ b/packages/core/test/evaluation/providers/targets.test.ts @@ -559,6 +559,57 @@ describe('resolveTargetDefinition', () => { ), ).toThrow(/workspace_template has been removed/i); }); + + it('resolves agentv target with model and default temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-judge', + provider: 'agentv', + model: 'openai:gpt-5-mini', + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('openai:gpt-5-mini'); + expect(target.config.temperature).toBe(0); + }); + + it('resolves agentv target with explicit temperature', () => { + const target = resolveTargetDefinition( + { + name: 'agentv-warm', + provider: 'agentv', + model: 'anthropic:claude-haiku-4.5', + temperature: 0.7, + }, + {}, + ); + + expect(target.kind).toBe('agentv'); + if (target.kind !== 'agentv') { + throw new Error('expected agentv target'); + } + + expect(target.config.model).toBe('anthropic:claude-haiku-4.5'); + expect(target.config.temperature).toBe(0.7); + }); + + it('throws when agentv target is missing model', () => { + expect(() => + resolveTargetDefinition( + { + name: 'agentv-no-model', + provider: 'agentv', + }, + {}, + ), + ).toThrow(/model/i); + }); }); describe('createProvider', () => { From 64d8d6d37c2c9cfe72feb21ebab84e56e0f913e2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 12:51:21 +0000 Subject: [PATCH 03/13] feat: add agentv provider implementation Co-Authored-By: Claude Opus 4.6 --- bun.lock | 7 +- packages/core/package.json | 3 +- 
.../evaluation/providers/agentv-provider.ts | 74 ++++++++++ .../core/src/evaluation/providers/index.ts | 3 + .../providers/agentv-provider.test.ts | 132 ++++++++++++++++++ 5 files changed, 215 insertions(+), 4 deletions(-) create mode 100644 packages/core/src/evaluation/providers/agentv-provider.ts create mode 100644 packages/core/test/evaluation/providers/agentv-provider.test.ts diff --git a/bun.lock b/bun.lock index 200a436cc..70471cff6 100644 --- a/bun.lock +++ b/bun.lock @@ -24,7 +24,7 @@ }, "apps/cli": { "name": "agentv", - "version": "2.12.0", + "version": "2.19.0", "bin": { "agentv": "./dist/cli.js", }, @@ -61,13 +61,14 @@ }, "packages/core": { "name": "@agentv/core", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", @@ -95,7 +96,7 @@ }, "packages/eval": { "name": "@agentv/eval", - "version": "2.12.0", + "version": "2.19.0", "dependencies": { "zod": "^3.23.8", }, diff --git a/packages/core/package.json b/packages/core/package.json index d0c0a031e..600890177 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -39,11 +39,12 @@ }, "files": ["dist", "README.md"], "dependencies": { - "@agentv/eval": "workspace:*", "@agentclientprotocol/sdk": "^0.14.1", + "@agentv/eval": "workspace:*", "@ai-sdk/anthropic": "^2.0.53", "@ai-sdk/azure": "^2.0.78", "@ai-sdk/google": "^2.0.44", + "@ai-sdk/openai": "^2.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts new file mode 100644 index 000000000..8e5e2c589 --- /dev/null +++ 
b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -0,0 +1,74 @@ +import { createAnthropic } from '@ai-sdk/anthropic'; +import { createAzure } from '@ai-sdk/azure'; +import { createGoogleGenerativeAI } from '@ai-sdk/google'; +import { createOpenAI } from '@ai-sdk/openai'; +import { type LanguageModel, createProviderRegistry } from 'ai'; + +import type { AgentVResolvedConfig } from './targets.js'; +import type { Provider, ProviderRequest, ProviderResponse } from './types.js'; + +/** + * Lazily-created singleton provider registry for resolving AI SDK model strings. + * Maps provider prefixes (e.g., "openai", "anthropic") to their AI SDK provider + * implementations so that model strings like "openai:gpt-5-mini" can be resolved + * to LanguageModel instances. + */ +let _registry: { languageModel: (id: string) => LanguageModel } | null = null; + +function getAiSdkRegistry(): { languageModel: (id: string) => LanguageModel } { + if (!_registry) { + // Cast through unknown: the registry's languageModel signature uses narrowed + // literal types, but we need to accept arbitrary model strings at runtime. + _registry = createProviderRegistry({ + openai: createOpenAI(), + anthropic: createAnthropic(), + azure: createAzure(), + google: createGoogleGenerativeAI(), + }) as unknown as { languageModel: (id: string) => LanguageModel }; + } + return _registry; +} + +/** + * AgentV built-in provider for LLM judge evaluation. + * + * Resolves an AI SDK model string (e.g., "openai:gpt-5-mini", "anthropic:claude-sonnet-4-20250514") + * to a Vercel AI SDK LanguageModel using createProviderRegistry. This provider is used + * exclusively for judge evaluation — it does not support direct agent invocation. 
+ * + * Usage: `--judge-target agentv --model openai:gpt-5-mini` + */ +export class AgentvProvider implements Provider { + readonly id: string; + readonly kind = 'agentv' as const; + readonly targetName: string; + + private readonly model: LanguageModel; + private readonly config: AgentVResolvedConfig; + + constructor(targetName: string, config: AgentVResolvedConfig) { + this.id = `agentv:${targetName}`; + this.targetName = targetName; + this.config = config; + + const registry = getAiSdkRegistry(); + this.model = registry.languageModel(config.model); + } + + /** + * Direct invoke is not supported for the agentv provider. + * Use asLanguageModel() with generateText() instead. + */ + async invoke(_request: ProviderRequest): Promise { + throw new Error( + 'AgentvProvider does not support direct invoke(). Use asLanguageModel() with generateText() instead.', + ); + } + + /** + * Returns the resolved AI SDK LanguageModel for use with generateText/generateObject. + */ + asLanguageModel(): LanguageModel { + return this.model; + } +} diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 62cd8eef8..6ec6e2dfa 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -1,3 +1,4 @@ +import { AgentvProvider } from './agentv-provider.js'; import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; import { ClaudeCliProvider } from './claude-cli.js'; import { ClaudeSdkProvider } from './claude-sdk.js'; @@ -30,6 +31,7 @@ export type { } from './types.js'; export type { + AgentVResolvedConfig, AnthropicResolvedConfig, AzureResolvedConfig, ClaudeResolvedConfig, @@ -95,6 +97,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry { // claude-sdk is the explicit SDK provider (requires @anthropic-ai/claude-agent-sdk) .register('claude-sdk', (t) => new ClaudeSdkProvider(t.name, t.config as never)) .register('mock', (t) => new 
MockProvider(t.name, t.config as never)) + .register('agentv', (t) => new AgentvProvider(t.name, t.config as never)) .register('vscode', (t) => new VSCodeProvider(t.name, t.config as never, 'vscode')) .register( 'vscode-insiders', diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts new file mode 100644 index 000000000..b2b3be2c2 --- /dev/null +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -0,0 +1,132 @@ +import { describe, expect, it, vi } from 'vitest'; + +// Mock AI SDK provider packages before importing the provider +vi.mock('@ai-sdk/openai', () => ({ + createOpenAI: () => { + const provider = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'openai', + }); + provider.languageModel = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'openai', + }); + provider.chatModel = provider.languageModel; + provider.textEmbeddingModel = () => ({}); + return provider; + }, +})); + +vi.mock('@ai-sdk/anthropic', () => ({ + createAnthropic: () => { + const provider = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'anthropic', + }); + provider.languageModel = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'anthropic', + }); + provider.chatModel = provider.languageModel; + provider.textEmbeddingModel = () => ({}); + return provider; + }, +})); + +vi.mock('@ai-sdk/azure', () => ({ + createAzure: () => { + const provider = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'azure', + }); + provider.languageModel = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'azure', + }); + provider.chatModel = provider.languageModel; + provider.textEmbeddingModel = () => ({}); + return provider; + }, +})); + +vi.mock('@ai-sdk/google', () => ({ + createGoogleGenerativeAI: () => { + const provider = (modelId: 
string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'google', + }); + provider.languageModel = (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'google', + }); + provider.chatModel = provider.languageModel; + provider.textEmbeddingModel = () => ({}); + return provider; + }, +})); + +import { AgentvProvider } from '../../../src/evaluation/providers/agentv-provider.js'; + +describe('AgentvProvider', () => { + it('has kind "agentv"', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.kind).toBe('agentv'); + }); + + it('has correct targetName', () => { + const provider = new AgentvProvider('my-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.targetName).toBe('my-judge'); + }); + + it('has correct id format', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + expect(provider.id).toBe('agentv:test-judge'); + }); + + it('asLanguageModel() returns a defined LanguageModel', () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect(model.modelId).toBe('gpt-5-mini'); + }); + + it('asLanguageModel() works with anthropic model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'anthropic:claude-sonnet-4-20250514', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect(model.modelId).toBe('claude-sonnet-4-20250514'); + }); + + it('invoke() throws an error', async () => { + const provider = new AgentvProvider('test-judge', { + model: 'openai:gpt-5-mini', + temperature: 0, + }); + await expect(provider.invoke({ question: 'test' })).rejects.toThrow( + 'AgentvProvider does not support direct invoke()', + ); + }); +}); From 
d6dbacdd89781ac842e8d90860dd4a403e30e393 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 12:54:54 +0000 Subject: [PATCH 04/13] fix: rewrite agentv provider to use direct SDK calls instead of registry Replace createProviderRegistry with direct createOpenAI/createAnthropic/ createAzure/createGoogleGenerativeAI calls to resolve v2/v3 spec version type compatibility issues. Parse "provider:model" strings manually via a switch statement. Simplify test mocks and add coverage for google, azure, and error cases. Co-Authored-By: Claude Opus 4.6 --- .../evaluation/providers/agentv-provider.ts | 62 +++++---- .../providers/agentv-provider.test.ts | 123 +++++++++--------- 2 files changed, 100 insertions(+), 85 deletions(-) diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts index 8e5e2c589..c9b1484a2 100644 --- a/packages/core/src/evaluation/providers/agentv-provider.ts +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -2,39 +2,57 @@ import { createAnthropic } from '@ai-sdk/anthropic'; import { createAzure } from '@ai-sdk/azure'; import { createGoogleGenerativeAI } from '@ai-sdk/google'; import { createOpenAI } from '@ai-sdk/openai'; -import { type LanguageModel, createProviderRegistry } from 'ai'; +import type { LanguageModel } from 'ai'; import type { AgentVResolvedConfig } from './targets.js'; import type { Provider, ProviderRequest, ProviderResponse } from './types.js'; /** - * Lazily-created singleton provider registry for resolving AI SDK model strings. - * Maps provider prefixes (e.g., "openai", "anthropic") to their AI SDK provider - * implementations so that model strings like "openai:gpt-5-mini" can be resolved - * to LanguageModel instances. + * Parse a model string like "openai:gpt-5-mini" into provider prefix and model name. 
*/ -let _registry: { languageModel: (id: string) => LanguageModel } | null = null; +function parseModelString(model: string): { provider: string; modelName: string } { + const colonIndex = model.indexOf(':'); + if (colonIndex === -1) { + throw new Error( + `Invalid model string "${model}". Expected format "provider:model" (e.g., "openai:gpt-5-mini")`, + ); + } + return { + provider: model.slice(0, colonIndex), + modelName: model.slice(colonIndex + 1), + }; +} -function getAiSdkRegistry(): { languageModel: (id: string) => LanguageModel } { - if (!_registry) { - // Cast through unknown: the registry's languageModel signature uses narrowed - // literal types, but we need to accept arbitrary model strings at runtime. - _registry = createProviderRegistry({ - openai: createOpenAI(), - anthropic: createAnthropic(), - azure: createAzure(), - google: createGoogleGenerativeAI(), - }) as unknown as { languageModel: (id: string) => LanguageModel }; +/** + * Create a LanguageModel from a model string using the appropriate AI SDK provider. + */ +function createLanguageModel(modelString: string): LanguageModel { + const { provider, modelName } = parseModelString(modelString); + + switch (provider) { + case 'openai': + return createOpenAI()(modelName); + case 'anthropic': + return createAnthropic()(modelName); + case 'azure': + return createAzure()(modelName); + case 'google': + return createGoogleGenerativeAI()(modelName); + default: + throw new Error( + `Unsupported AI SDK provider "${provider}" in model string "${modelString}". ` + + 'Supported providers: openai, anthropic, azure, google', + ); } - return _registry; } /** * AgentV built-in provider for LLM judge evaluation. * * Resolves an AI SDK model string (e.g., "openai:gpt-5-mini", "anthropic:claude-sonnet-4-20250514") - * to a Vercel AI SDK LanguageModel using createProviderRegistry. This provider is used - * exclusively for judge evaluation — it does not support direct agent invocation. 
+ * to a Vercel AI SDK LanguageModel by parsing the provider prefix and creating the appropriate + * AI SDK provider directly. This provider is used exclusively for judge evaluation — it does not + * support direct agent invocation. * * Usage: `--judge-target agentv --model openai:gpt-5-mini` */ @@ -44,15 +62,11 @@ export class AgentvProvider implements Provider { readonly targetName: string; private readonly model: LanguageModel; - private readonly config: AgentVResolvedConfig; constructor(targetName: string, config: AgentVResolvedConfig) { this.id = `agentv:${targetName}`; this.targetName = targetName; - this.config = config; - - const registry = getAiSdkRegistry(); - this.model = registry.languageModel(config.model); + this.model = createLanguageModel(config.model); } /** diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts index b2b3be2c2..f2c66a523 100644 --- a/packages/core/test/evaluation/providers/agentv-provider.test.ts +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -1,76 +1,37 @@ import { describe, expect, it, vi } from 'vitest'; -// Mock AI SDK provider packages before importing the provider +// Mock AI SDK provider packages before importing the provider. +// Each createXxx() returns a callable factory: createXxx()(modelName) => model stub. 
vi.mock('@ai-sdk/openai', () => ({ - createOpenAI: () => { - const provider = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'openai', - }); - provider.languageModel = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'openai', - }); - provider.chatModel = provider.languageModel; - provider.textEmbeddingModel = () => ({}); - return provider; - }, + createOpenAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'openai', + }), })); vi.mock('@ai-sdk/anthropic', () => ({ - createAnthropic: () => { - const provider = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'anthropic', - }); - provider.languageModel = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'anthropic', - }); - provider.chatModel = provider.languageModel; - provider.textEmbeddingModel = () => ({}); - return provider; - }, + createAnthropic: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'anthropic', + }), })); vi.mock('@ai-sdk/azure', () => ({ - createAzure: () => { - const provider = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'azure', - }); - provider.languageModel = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'azure', - }); - provider.chatModel = provider.languageModel; - provider.textEmbeddingModel = () => ({}); - return provider; - }, + createAzure: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'azure', + }), })); vi.mock('@ai-sdk/google', () => ({ - createGoogleGenerativeAI: () => { - const provider = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'google', - }); - provider.languageModel = (modelId: string) => ({ - modelId, - specificationVersion: 'v2', - provider: 'google', - }); - provider.chatModel = provider.languageModel; - provider.textEmbeddingModel = () => ({}); - return 
provider; - }, + createGoogleGenerativeAI: () => (modelId: string) => ({ + modelId, + specificationVersion: 'v2', + provider: 'google', + }), })); import { AgentvProvider } from '../../../src/evaluation/providers/agentv-provider.js'; @@ -120,6 +81,46 @@ describe('AgentvProvider', () => { expect(model.modelId).toBe('claude-sonnet-4-20250514'); }); + it('asLanguageModel() works with google model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'google:gemini-2.5-flash', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect(model.modelId).toBe('gemini-2.5-flash'); + }); + + it('asLanguageModel() works with azure model strings', () => { + const provider = new AgentvProvider('test-judge', { + model: 'azure:gpt-4o-deployment', + temperature: 0, + }); + const model = provider.asLanguageModel(); + expect(model).toBeDefined(); + expect(model.modelId).toBe('gpt-4o-deployment'); + }); + + it('throws for unsupported provider prefix', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'unsupported:some-model', + temperature: 0, + }), + ).toThrow('Unsupported AI SDK provider "unsupported"'); + }); + + it('throws for model string without colon separator', () => { + expect( + () => + new AgentvProvider('test-judge', { + model: 'gpt-5-mini', + temperature: 0, + }), + ).toThrow('Invalid model string "gpt-5-mini"'); + }); + it('invoke() throws an error', async () => { const provider = new AgentvProvider('test-judge', { model: 'openai:gpt-5-mini', From d58b34cd5eddb2d7c9a6fd10687e6bcfdfdb3337 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 12:56:29 +0000 Subject: [PATCH 05/13] fix: cast openai v3 model to LanguageModel, fix test assertions --- packages/core/src/evaluation/providers/agentv-provider.ts | 4 +++- .../test/evaluation/providers/agentv-provider.test.ts | 8 ++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git 
a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts index c9b1484a2..06abd9cfe 100644 --- a/packages/core/src/evaluation/providers/agentv-provider.ts +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -31,7 +31,9 @@ function createLanguageModel(modelString: string): LanguageModel { switch (provider) { case 'openai': - return createOpenAI()(modelName); + // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the + // codebase uses LanguageModelV2. The runtime API is compatible. + return createOpenAI()(modelName) as unknown as LanguageModel; case 'anthropic': return createAnthropic()(modelName); case 'azure': diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts index f2c66a523..8670f4ec3 100644 --- a/packages/core/test/evaluation/providers/agentv-provider.test.ts +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -68,7 +68,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect(model.modelId).toBe('gpt-5-mini'); + expect((model as any).modelId).toBe('gpt-5-mini'); }); it('asLanguageModel() works with anthropic model strings', () => { @@ -78,7 +78,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect(model.modelId).toBe('claude-sonnet-4-20250514'); + expect((model as any).modelId).toBe('claude-sonnet-4-20250514'); }); it('asLanguageModel() works with google model strings', () => { @@ -88,7 +88,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect(model.modelId).toBe('gemini-2.5-flash'); + expect((model as any).modelId).toBe('gemini-2.5-flash'); }); it('asLanguageModel() works with azure model strings', () => { @@ -98,7 +98,7 @@ describe('AgentvProvider', () 
=> { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect(model.modelId).toBe('gpt-4o-deployment'); + expect((model as any).modelId).toBe('gpt-4o-deployment'); }); it('throws for unsupported provider prefix', () => { From 4f8f9f0bda18e0a244b917ccb1b4c3a32fd0bca3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 13:14:27 +0000 Subject: [PATCH 06/13] feat: absorb agent-judge into llm-judge with auto-detection Remove agent-judge as a separate evaluator type. LlmJudgeEvaluator now auto-detects mode based on the resolved judge provider: - LLM providers (azure, anthropic, gemini): structured JSON mode - Agent providers (claude-cli, copilot, etc.): delegate mode - agentv provider: built-in AI SDK agent mode with filesystem tools Closes #614 --- .../src/evaluation/evaluators/agent-judge.ts | 598 --- .../core/src/evaluation/evaluators/index.ts | 3 - .../src/evaluation/evaluators/llm-judge.ts | 674 ++- .../evaluation/loaders/evaluator-parser.ts | 33 +- .../evaluation/registry/builtin-evaluators.ts | 55 +- packages/core/src/evaluation/types.ts | 35 +- .../evaluation/validation/eval-file.schema.ts | 2 + packages/eval/src/assertion.ts | 2 - .../references/eval-schema.json | 3870 ++++++++++++++--- 9 files changed, 3865 insertions(+), 1407 deletions(-) delete mode 100644 packages/core/src/evaluation/evaluators/agent-judge.ts diff --git a/packages/core/src/evaluation/evaluators/agent-judge.ts b/packages/core/src/evaluation/evaluators/agent-judge.ts deleted file mode 100644 index 2dc00f769..000000000 --- a/packages/core/src/evaluation/evaluators/agent-judge.ts +++ /dev/null @@ -1,598 +0,0 @@ -import fs from 'node:fs/promises'; -import path from 'node:path'; - -import { generateText, stepCountIs, tool } from 'ai'; -import { z } from 'zod'; - -import { extractLastAssistantContent } from '../providers/types.js'; -import type { Provider } from '../providers/types.js'; -import { TEMPLATE_VARIABLES } from '../template-variables.js'; 
-import type { JsonObject, RubricItem } from '../types.js'; -import { - buildOutputSchema, - buildRubricOutputSchema, - calculateRubricScore, - freeformEvaluationSchema, - rubricEvaluationSchema, - substituteVariables, -} from './llm-judge.js'; -import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; -import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; - -const DEFAULT_MAX_STEPS = 10; -const MAX_STEPS_LIMIT = 50; -const MAX_FILE_SIZE = 50 * 1024; // 50KB -const MAX_SEARCH_MATCHES = 20; - -/** - * Directories/patterns to skip during file search. - */ -const SEARCH_SKIP_DIRS = new Set([ - 'node_modules', - '.git', - '.next', - 'dist', - '__pycache__', - '.cache', -]); - -/** - * Binary file extensions to skip during search. - */ -const BINARY_EXTENSIONS = new Set([ - '.png', - '.jpg', - '.jpeg', - '.gif', - '.ico', - '.svg', - '.woff', - '.woff2', - '.ttf', - '.eot', - '.mp3', - '.mp4', - '.wav', - '.zip', - '.tar', - '.gz', - '.pdf', - '.exe', - '.dll', - '.so', - '.dylib', -]); - -export interface AgentJudgeEvaluatorOptions { - readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - readonly maxSteps?: number; - readonly temperature?: number; - readonly evaluatorTemplate?: string; - readonly judgeTargetProvider?: Provider; -} - -export class AgentJudgeEvaluator implements Evaluator { - readonly kind = 'agent-judge'; - - private readonly resolveJudgeProvider: (ctx: EvaluationContext) => Promise; - private readonly maxSteps: number; - private readonly temperature: number; - private readonly evaluatorTemplate?: string; - private readonly judgeTargetProvider?: Provider; - - constructor(options: AgentJudgeEvaluatorOptions) { - this.resolveJudgeProvider = options.resolveJudgeProvider; - this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); - this.temperature = options.temperature ?? 
0; - this.evaluatorTemplate = options.evaluatorTemplate; - this.judgeTargetProvider = options.judgeTargetProvider; - } - - async evaluate(context: EvaluationContext): Promise { - if (this.judgeTargetProvider) { - return this.evaluateWithJudgeTarget(context); - } - return this.evaluateBuiltIn(context); - } - - /** - * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. - */ - private async evaluateBuiltIn(context: EvaluationContext): Promise { - const judgeProvider = await this.resolveJudgeProvider(context); - if (!judgeProvider) { - throw new Error('No judge provider available for agent-judge evaluation'); - } - - const model = judgeProvider.asLanguageModel?.(); - if (!model) { - throw new Error( - `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent-judge mode`, - ); - } - - const workspacePath = context.workspacePath; - if (!workspacePath) { - throw new Error( - 'agent-judge evaluator requires a workspace_template target (workspacePath is not set)', - ); - } - - const systemPrompt = this.buildSystemPrompt(context); - const userPrompt = this.buildUserPrompt(context); - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const fsTools = createFilesystemTools(workspacePath); - - const evaluatorRawRequest: JsonObject = { - mode: 'built-in', - systemPrompt, - userPrompt, - target: judgeProvider.targetName, - maxSteps: this.maxSteps, - }; - - try { - const { text, steps } = await generateText({ - model, - system: systemPrompt, - prompt: userPrompt, - tools: fsTools, - stopWhen: stepCountIs(this.maxSteps), - temperature: this.temperature, - }); - - const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 
0), 0); - - const details: JsonObject = { - mode: 'built-in', - steps: steps.length, - tool_calls: toolCallCount, - }; - - return this.parseResult(text, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge built-in evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'built-in', error: message }, - }; - } - } - - /** - * Judge target mode: Delegates to an external agent provider via Provider.invoke(). - */ - private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { - const provider = this.judgeTargetProvider as Provider; - - const workspacePath = context.workspacePath; - const prompt = this.buildDelegatedPrompt(context); - - const evaluatorRawRequest: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - prompt, - }; - - try { - const response = await provider.invoke({ - question: prompt, - cwd: workspacePath, - evalCaseId: context.evalCase.id, - attempt: context.attempt, - }); - - const assistantContent = extractLastAssistantContent(response.output); - if (!assistantContent) { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['agent-judge judge_target returned no assistant response'], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'judge_target', judge_target: provider.targetName }, - }; - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - const details: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - }; - - return this.parseResult(assistantContent, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`agent-judge judge_target evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { - mode: 'judge_target', - judge_target: provider.targetName, - error: message, - }, - }; - } - } - - /** - * Parse the agent's response text into an EvaluationScore. - * Supports both freeform and rubric modes. - */ - private parseResult( - text: string, - rubrics: readonly RubricItem[] | undefined, - evaluatorRawRequest: JsonObject, - details: JsonObject, - ): EvaluationScore { - try { - const parsed = parseJsonFromText(text); - - if (rubrics && rubrics.length > 0) { - const data = rubricEvaluationSchema.parse(parsed); - const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); - return { - score, - verdict, - hits, - misses, - expectedAspectCount: rubrics.length, - reasoning: data.overall_reasoning, - evaluatorRawRequest, - details, - }; - } - - const data = freeformEvaluationSchema.parse(parsed); - const score = clampScore(data.score); - const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; - const misses = Array.isArray(data.misses) - ? data.misses.filter(isNonEmptyString).slice(0, 4) - : []; - - return { - score, - verdict: scoreToVerdict(score), - hits, - misses, - expectedAspectCount: Math.max(hits.length + misses.length, 1), - reasoning: data.reasoning, - evaluatorRawRequest, - details, - }; - } catch { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['Failed to parse agent-judge response as valid evaluation JSON'], - expectedAspectCount: 1, - evaluatorRawRequest, - details, - }; - } - } - - /** - * Build system prompt for built-in mode. - * Includes output format instructions. - */ - private buildSystemPrompt(context: EvaluationContext): string { - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'You are an expert evaluator with access to the workspace filesystem.', - 'Use the provided tools to investigate the workspace and verify the criteria are met.', - 'Thoroughly examine relevant files before making your assessment.', - '', - ]; - - if (rubrics && rubrics.length > 0) { - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } - - /** - * Build user prompt for built-in mode. - * Uses custom template if provided, otherwise builds default prompt. - */ - private buildUserPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - - if (this.evaluatorTemplate) { - return substituteVariables(this.evaluatorTemplate, variables); - } - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? 
config.rubrics : undefined; - - const parts: string[] = [ - 'Evaluate the candidate answer by investigating the workspace.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push( - '', - 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', - ); - } else { - parts.push( - 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', - ); - } - - return parts.join('\n'); - } - - /** - * Build the full evaluation prompt for judge target mode (delegation). - * Combines task context, criteria, candidate info, and output format instructions. - */ - private buildDelegatedPrompt(context: EvaluationContext): string { - const formattedQuestion = - context.promptInputs.question && context.promptInputs.question.trim().length > 0 - ? context.promptInputs.question - : context.evalCase.question; - - const config = context.evaluator; - const rubrics = config?.type === 'agent-judge' ? config.rubrics : undefined; - - if (this.evaluatorTemplate) { - const variables: Record = { - [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), - [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? 
'').trim(), - [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), - [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), - [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', - }; - const customPrompt = substituteVariables(this.evaluatorTemplate, variables); - - const outputSchema = - rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); - - return `${customPrompt}\n\n${outputSchema}`; - } - - const parts: string[] = [ - 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', - '', - '[[ ## question ## ]]', - formattedQuestion, - '', - '[[ ## criteria ## ]]', - context.evalCase.criteria, - '', - ]; - - if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { - parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); - } - - parts.push('[[ ## answer ## ]]', context.candidate, ''); - - if (context.fileChanges) { - parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); - } - - if (rubrics && rubrics.length > 0) { - parts.push('[[ ## rubrics ## ]]'); - for (const rubric of rubrics) { - const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; - const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; - parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); - } - parts.push(''); - parts.push(buildRubricOutputSchema()); - } else { - parts.push(buildOutputSchema()); - } - - return parts.join('\n'); - } -} - -// --------------------------------------------------------------------------- -// Sandboxed filesystem tools for built-in mode -// --------------------------------------------------------------------------- - -/** - * Resolve a relative path within the sandbox, preventing path traversal. - * Returns the absolute path if valid, or throws if the path escapes the sandbox. 
- */ -function resolveSandboxed(basePath: string, relativePath: string): string { - const resolved = path.resolve(basePath, relativePath); - if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { - throw new Error(`Path '${relativePath}' is outside the workspace`); - } - return resolved; -} - -/** - * Create sandboxed filesystem tools for the AI SDK agent loop. - */ -function createFilesystemTools(workspacePath: string) { - return { - list_files: tool({ - description: - 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', - inputSchema: z.object({ - path: z.string().describe('Relative path within workspace (use "." for root)').default('.'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const entries = await fs.readdir(resolved, { withFileTypes: true }); - return entries - .map((e) => ({ - name: e.name, - type: e.isDirectory() ? 'directory' : 'file', - })) - .slice(0, 100); - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - read_file: tool({ - description: - 'Read the content of a file at a relative path within the workspace. 
Large files are truncated at 50KB.', - inputSchema: z.object({ - path: z.string().describe('Relative path to file within workspace'), - }), - execute: async (input: { path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const stat = await fs.stat(resolved); - if (stat.isDirectory()) { - return { error: `'${input.path}' is a directory, not a file` }; - } - const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); - const fd = await fs.open(resolved, 'r'); - try { - await fd.read(buffer, 0, buffer.length, 0); - } finally { - await fd.close(); - } - const content = buffer.toString('utf-8'); - const truncated = stat.size > MAX_FILE_SIZE; - return { content, truncated, size: stat.size }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - - search_files: tool({ - description: - 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', - inputSchema: z.object({ - pattern: z.string().describe('Regex pattern to search for'), - path: z.string().describe('Relative path to search within (use "." for root)').default('.'), - }), - execute: async (input: { pattern: string; path: string }) => { - try { - const resolved = resolveSandboxed(workspacePath, input.path); - const regex = new RegExp(input.pattern, 'gi'); - const matches: Array<{ file: string; line: number; text: string }> = []; - - await searchDirectory(resolved, workspacePath, regex, matches); - - return { matches, total: matches.length }; - } catch (error) { - return { error: error instanceof Error ? error.message : String(error) }; - } - }, - }), - }; -} - -/** - * Recursively search a directory for regex matches. 
- */ -async function searchDirectory( - dirPath: string, - workspacePath: string, - regex: RegExp, - matches: Array<{ file: string; line: number; text: string }>, -): Promise { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - let entries: import('node:fs').Dirent[]; - try { - entries = await fs.readdir(dirPath, { withFileTypes: true }); - } catch { - return; - } - - for (const entry of entries) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - - if (SEARCH_SKIP_DIRS.has(entry.name)) continue; - - const fullPath = path.join(dirPath, entry.name); - - if (entry.isDirectory()) { - await searchDirectory(fullPath, workspacePath, regex, matches); - } else if (entry.isFile()) { - const ext = path.extname(entry.name).toLowerCase(); - if (BINARY_EXTENSIONS.has(ext)) continue; - - try { - const stat = await fs.stat(fullPath); - if (stat.size > MAX_FILE_SIZE) continue; - - const content = await fs.readFile(fullPath, 'utf-8'); - const lines = content.split('\n'); - - for (let i = 0; i < lines.length; i++) { - if (matches.length >= MAX_SEARCH_MATCHES) return; - regex.lastIndex = 0; - if (regex.test(lines[i])) { - matches.push({ - file: path.relative(workspacePath, fullPath), - line: i + 1, - text: lines[i].substring(0, 200), - }); - } - } - } catch { - // Skip unreadable files - } - } - } -} diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 157ff7c99..a64705fbe 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -52,9 +52,6 @@ export { } from './llm-judge.js'; export type { LlmJudgeEvaluatorOptions } from './llm-judge.js'; -export { AgentJudgeEvaluator } from './agent-judge.js'; -export type { AgentJudgeEvaluatorOptions } from './agent-judge.js'; - export { SkillTriggerEvaluator } from './skill-trigger.js'; export { assembleLlmJudgePrompt } from './llm-judge-prompt.js'; diff --git 
a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 46125f3e7..91e6578bc 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -1,14 +1,65 @@ -import { generateText } from 'ai'; +import fs from 'node:fs/promises'; +import path from 'node:path'; + +import { generateText, stepCountIs, tool } from 'ai'; import { z } from 'zod'; import type { Provider, ProviderResponse } from '../providers/types.js'; -import { extractLastAssistantContent } from '../providers/types.js'; +import { extractLastAssistantContent, isAgentProvider } from '../providers/types.js'; import { TEMPLATE_VARIABLES } from '../template-variables.js'; import type { TokenUsage } from '../trace.js'; import type { JsonObject, RubricItem } from '../types.js'; import { clampScore, isNonEmptyString, parseJsonFromText, scoreToVerdict } from './scoring.js'; import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; +// --------------------------------------------------------------------------- +// Constants for built-in agent mode (filesystem tools) +// --------------------------------------------------------------------------- + +const DEFAULT_MAX_STEPS = 10; +const MAX_STEPS_LIMIT = 50; +const MAX_FILE_SIZE = 50 * 1024; // 50KB +const MAX_SEARCH_MATCHES = 20; + +/** + * Directories/patterns to skip during file search. + */ +const SEARCH_SKIP_DIRS = new Set([ + 'node_modules', + '.git', + '.next', + 'dist', + '__pycache__', + '.cache', +]); + +/** + * Binary file extensions to skip during search. + */ +const BINARY_EXTENSIONS = new Set([ + '.png', + '.jpg', + '.jpeg', + '.gif', + '.ico', + '.svg', + '.woff', + '.woff2', + '.ttf', + '.eot', + '.mp3', + '.mp4', + '.wav', + '.zip', + '.tar', + '.gz', + '.pdf', + '.exe', + '.dll', + '.so', + '.dylib', +]); + /** * Default evaluator template for the user prompt (variables will be substituted). 
* Custom evaluators can override this via evaluatorTemplate option. @@ -38,6 +89,8 @@ export interface LlmJudgeEvaluatorOptions { readonly maxOutputTokens?: number; readonly temperature?: number; readonly evaluatorTemplate?: string; + readonly maxSteps?: number; + readonly judgeTargetProvider?: Provider; } const freeformEvaluationSchema = z.object({ @@ -82,20 +135,40 @@ export class LlmJudgeEvaluator implements Evaluator { private readonly maxOutputTokens?: number; private readonly temperature?: number; private readonly evaluatorTemplate?: string; + private readonly maxSteps: number; + private readonly judgeTargetProvider?: Provider; constructor(options: LlmJudgeEvaluatorOptions) { this.resolveJudgeProvider = options.resolveJudgeProvider; this.maxOutputTokens = options.maxOutputTokens; this.temperature = options.temperature; this.evaluatorTemplate = options.evaluatorTemplate; + this.maxSteps = Math.min(options.maxSteps ?? DEFAULT_MAX_STEPS, MAX_STEPS_LIMIT); + this.judgeTargetProvider = options.judgeTargetProvider; } async evaluate(context: EvaluationContext): Promise { + // Delegate mode: judge target provider is an agent provider — send prompt via invoke() + if (this.judgeTargetProvider) { + return this.evaluateWithJudgeTarget(context); + } + const judgeProvider = await this.resolveJudgeProvider(context); if (!judgeProvider) { throw new Error('No judge provider available for LLM grading'); } + // Built-in agent mode: agentv provider → AI SDK generateText with filesystem tools + if (judgeProvider.kind === 'agentv') { + return this.evaluateBuiltIn(context, judgeProvider); + } + + // Delegate mode: resolved provider is an agent provider → send prompt via invoke() + if (isAgentProvider(judgeProvider)) { + return this.evaluateWithDelegatedAgent(context, judgeProvider); + } + + // LLM mode: structured JSON evaluation const config = context.evaluator; if (config?.type === 'llm-judge' && config.rubrics && config.rubrics.length > 0) { return 
this.evaluateWithRubrics(context, judgeProvider, config.rubrics); @@ -104,6 +177,10 @@ export class LlmJudgeEvaluator implements Evaluator { return this.evaluateFreeform(context, judgeProvider); } + // --------------------------------------------------------------------------- + // LLM mode (existing) + // --------------------------------------------------------------------------- + private async evaluateFreeform( context: EvaluationContext, judgeProvider: Provider, @@ -177,7 +254,7 @@ export class LlmJudgeEvaluator implements Evaluator { tokenUsage, }; } catch (e: unknown) { - // Judge parse failure → skip (not silent zero). + // Judge parse failure -> skip (not silent zero). // Signals infrastructure error to downstream consumers, excluded from score averages. const message = e instanceof Error ? e.message : String(e); const evalName = context.evaluator?.name ?? 'llm-judge'; @@ -314,6 +391,437 @@ export class LlmJudgeEvaluator implements Evaluator { } } + // --------------------------------------------------------------------------- + // Built-in agent mode (agentv provider — AI SDK generateText with filesystem tools) + // --------------------------------------------------------------------------- + + /** + * Built-in mode: Uses Vercel AI SDK generateText() with sandboxed filesystem tools. 
+ */ + private async evaluateBuiltIn( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + const model = judgeProvider.asLanguageModel?.(); + if (!model) { + throw new Error( + `Judge provider '${judgeProvider.targetName}' does not support asLanguageModel() — required for built-in agent mode`, + ); + } + + const workspacePath = context.workspacePath; + if (!workspacePath) { + throw new Error( + 'llm-judge built-in agent mode requires a workspace_template target (workspacePath is not set)', + ); + } + + const systemPrompt = this.buildAgentSystemPrompt(context); + const userPrompt = this.buildAgentUserPrompt(context); + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const fsTools = createFilesystemTools(workspacePath); + + const evaluatorRawRequest: JsonObject = { + mode: 'built-in', + systemPrompt, + userPrompt, + target: judgeProvider.targetName, + maxSteps: this.maxSteps, + }; + + try { + const { text, steps } = await generateText({ + model, + system: systemPrompt, + prompt: userPrompt, + tools: fsTools, + stopWhen: stepCountIs(this.maxSteps), + temperature: this.temperature ?? 0, + }); + + const toolCallCount = steps.reduce((count, step) => count + (step.toolCalls?.length ?? 0), 0); + + const details: JsonObject = { + mode: 'built-in', + steps: steps.length, + tool_calls: toolCallCount, + }; + + return this.parseAgentResult(text, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge built-in evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: 'built-in', error: message }, + }; + } + } + + // --------------------------------------------------------------------------- + // Delegate mode (agent provider — send prompt via Provider.invoke()) + // --------------------------------------------------------------------------- + + /** + * Judge target mode: Delegates to an explicit judgeTargetProvider via Provider.invoke(). + */ + private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { + const provider = this.judgeTargetProvider as Provider; + + const workspacePath = context.workspacePath; + const prompt = this.buildDelegatedPrompt(context); + + const evaluatorRawRequest: JsonObject = { + mode: 'judge_target', + judge_target: provider.targetName, + prompt, + }; + + try { + const response = await provider.invoke({ + question: prompt, + cwd: workspacePath, + evalCaseId: context.evalCase.id, + attempt: context.attempt, + }); + + const assistantContent = extractLastAssistantContent(response.output); + if (!assistantContent) { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: ['llm-judge judge_target returned no assistant response'], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: 'judge_target', judge_target: provider.targetName }, + }; + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const details: JsonObject = { + mode: 'judge_target', + judge_target: provider.targetName, + }; + + return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge judge_target evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { + mode: 'judge_target', + judge_target: provider.targetName, + error: message, + }, + }; + } + } + + /** + * Delegate mode: resolved provider is an agent provider — send prompt via invoke(). + */ + private async evaluateWithDelegatedAgent( + context: EvaluationContext, + judgeProvider: Provider, + ): Promise { + const workspacePath = context.workspacePath; + const prompt = this.buildDelegatedPrompt(context); + + const evaluatorRawRequest: JsonObject = { + mode: 'judge_target', + judge_target: judgeProvider.targetName, + prompt, + }; + + try { + const response = await judgeProvider.invoke({ + question: prompt, + cwd: workspacePath, + evalCaseId: context.evalCase.id, + attempt: context.attempt, + }); + + const assistantContent = extractLastAssistantContent(response.output); + if (!assistantContent) { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: ['llm-judge delegate returned no assistant response'], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { mode: 'judge_target', judge_target: judgeProvider.targetName }, + }; + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const details: JsonObject = { + mode: 'judge_target', + judge_target: judgeProvider.targetName, + }; + + return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); + } catch (error) { + const message = error instanceof Error ? 
error.message : String(error); + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [`llm-judge delegate evaluation failed: ${message}`], + expectedAspectCount: 1, + evaluatorRawRequest, + details: { + mode: 'judge_target', + judge_target: judgeProvider.targetName, + error: message, + }, + }; + } + } + + // --------------------------------------------------------------------------- + // Prompt builders for agent modes + // --------------------------------------------------------------------------- + + /** + * Build system prompt for built-in agent mode. + * Includes output format instructions. + */ + private buildAgentSystemPrompt(context: EvaluationContext): string { + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const parts: string[] = [ + 'You are an expert evaluator with access to the workspace filesystem.', + 'Use the provided tools to investigate the workspace and verify the criteria are met.', + 'Thoroughly examine relevant files before making your assessment.', + '', + ]; + + if (rubrics && rubrics.length > 0) { + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + /** + * Build user prompt for built-in agent mode. + * Uses custom template if provided, otherwise builds default prompt. + */ + private buildAgentUserPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? 
'', + }; + + if (this.evaluatorTemplate) { + return substituteVariables(this.evaluatorTemplate, variables); + } + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; + + const parts: string[] = [ + 'Evaluate the candidate answer by investigating the workspace.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? ` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push( + '', + 'For each rubric, investigate the workspace to determine if it is satisfied. Provide brief reasoning.', + ); + } else { + parts.push( + 'Investigate the workspace to verify the criteria. Provide a score between 0.0 and 1.0.', + ); + } + + return parts.join('\n'); + } + + /** + * Build the full evaluation prompt for delegate mode (agent providers). + * Combines task context, criteria, candidate info, and output format instructions. + */ + private buildDelegatedPrompt(context: EvaluationContext): string { + const formattedQuestion = + context.promptInputs.question && context.promptInputs.question.trim().length > 0 + ? context.promptInputs.question + : context.evalCase.question; + + const config = context.evaluator; + const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; + + if (this.evaluatorTemplate) { + const variables: Record = { + [TEMPLATE_VARIABLES.ANSWER]: context.candidate.trim(), + [TEMPLATE_VARIABLES.REFERENCE_ANSWER]: (context.evalCase.reference_answer ?? '').trim(), + [TEMPLATE_VARIABLES.CRITERIA]: context.evalCase.criteria.trim(), + [TEMPLATE_VARIABLES.QUESTION]: formattedQuestion.trim(), + [TEMPLATE_VARIABLES.FILE_CHANGES]: context.fileChanges ?? '', + }; + const customPrompt = substituteVariables(this.evaluatorTemplate, variables); + + const outputSchema = + rubrics && rubrics.length > 0 ? buildRubricOutputSchema() : buildOutputSchema(); + + return `${customPrompt}\n\n${outputSchema}`; + } + + const parts: string[] = [ + 'You are an expert evaluator. Investigate the workspace to verify the criteria are met.', + '', + '[[ ## question ## ]]', + formattedQuestion, + '', + '[[ ## criteria ## ]]', + context.evalCase.criteria, + '', + ]; + + if (context.evalCase.reference_answer && context.evalCase.reference_answer.trim().length > 0) { + parts.push('[[ ## reference_answer ## ]]', context.evalCase.reference_answer, ''); + } + + parts.push('[[ ## answer ## ]]', context.candidate, ''); + + if (context.fileChanges) { + parts.push('[[ ## file_changes ## ]]', context.fileChanges, ''); + } + + if (rubrics && rubrics.length > 0) { + parts.push('[[ ## rubrics ## ]]'); + for (const rubric of rubrics) { + const requiredLabel = rubric.required ? ' (REQUIRED)' : ''; + const weightLabel = rubric.weight !== 1.0 ? 
` (weight: ${rubric.weight})` : ''; + parts.push(`- [${rubric.id}]${requiredLabel}${weightLabel}: ${rubric.outcome}`); + } + parts.push(''); + parts.push(buildRubricOutputSchema()); + } else { + parts.push(buildOutputSchema()); + } + + return parts.join('\n'); + } + + // --------------------------------------------------------------------------- + // Agent result parser (shared by built-in and delegate modes) + // --------------------------------------------------------------------------- + + /** + * Parse the agent's response text into an EvaluationScore. + * Supports both freeform and rubric modes. + */ + private parseAgentResult( + text: string, + rubrics: readonly RubricItem[] | undefined, + evaluatorRawRequest: JsonObject, + details: JsonObject, + ): EvaluationScore { + try { + const parsed = parseJsonFromText(text); + + if (rubrics && rubrics.length > 0) { + const data = rubricEvaluationSchema.parse(parsed); + const { score, verdict, hits, misses } = calculateRubricScore(data, rubrics); + return { + score, + verdict, + hits, + misses, + expectedAspectCount: rubrics.length, + reasoning: data.overall_reasoning, + evaluatorRawRequest, + details, + }; + } + + const data = freeformEvaluationSchema.parse(parsed); + const score = clampScore(data.score); + const hits = Array.isArray(data.hits) ? data.hits.filter(isNonEmptyString).slice(0, 4) : []; + const misses = Array.isArray(data.misses) + ? 
data.misses.filter(isNonEmptyString).slice(0, 4) + : []; + + return { + score, + verdict: scoreToVerdict(score), + hits, + misses, + expectedAspectCount: Math.max(hits.length + misses.length, 1), + reasoning: data.reasoning, + evaluatorRawRequest, + details, + }; + } catch { + return { + score: 0, + verdict: 'fail', + hits: [], + misses: ['Failed to parse llm-judge agent response as valid evaluation JSON'], + expectedAspectCount: 1, + evaluatorRawRequest, + details, + }; + } + } + + // --------------------------------------------------------------------------- + // LLM mode prompt builders + // --------------------------------------------------------------------------- + /** * Build prompt for score-range rubric evaluation. */ @@ -421,6 +929,10 @@ export class LlmJudgeEvaluator implements Evaluator { return parts.join('\n'); } + // --------------------------------------------------------------------------- + // LLM mode retry logic + // --------------------------------------------------------------------------- + private async runWithRetry(options: { readonly context: EvaluationContext; readonly judgeProvider: Provider; @@ -474,6 +986,10 @@ export class LlmJudgeEvaluator implements Evaluator { } } +// --------------------------------------------------------------------------- +// Output schema builders (exported for reuse) +// --------------------------------------------------------------------------- + /** * Build the mandatory output schema that all evaluators must follow. * This schema is always appended to the evaluator template. @@ -656,3 +1172,155 @@ function calculateScoreRangeResult( }, }; } + +// --------------------------------------------------------------------------- +// Sandboxed filesystem tools for built-in agent mode +// --------------------------------------------------------------------------- + +/** + * Resolve a relative path within the sandbox, preventing path traversal. 
+ * Returns the absolute path if valid, or throws if the path escapes the sandbox. + */ +function resolveSandboxed(basePath: string, relativePath: string): string { + const resolved = path.resolve(basePath, relativePath); + if (!resolved.startsWith(basePath + path.sep) && resolved !== basePath) { + throw new Error(`Path '${relativePath}' is outside the workspace`); + } + return resolved; +} + +/** + * Create sandboxed filesystem tools for the AI SDK agent loop. + */ +function createFilesystemTools(workspacePath: string) { + return { + list_files: tool({ + description: + 'List files and directories at a relative path within the workspace. Returns names only (single level, no recursion).', + inputSchema: z.object({ + path: z.string().describe('Relative path within workspace (use "." for root)').default('.'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const entries = await fs.readdir(resolved, { withFileTypes: true }); + return entries + .map((e) => ({ + name: e.name, + type: e.isDirectory() ? 'directory' : 'file', + })) + .slice(0, 100); + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + read_file: tool({ + description: + 'Read the content of a file at a relative path within the workspace. 
Large files are truncated at 50KB.', + inputSchema: z.object({ + path: z.string().describe('Relative path to file within workspace'), + }), + execute: async (input: { path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const stat = await fs.stat(resolved); + if (stat.isDirectory()) { + return { error: `'${input.path}' is a directory, not a file` }; + } + const buffer = Buffer.alloc(Math.min(stat.size, MAX_FILE_SIZE)); + const fd = await fs.open(resolved, 'r'); + try { + await fd.read(buffer, 0, buffer.length, 0); + } finally { + await fd.close(); + } + const content = buffer.toString('utf-8'); + const truncated = stat.size > MAX_FILE_SIZE; + return { content, truncated, size: stat.size }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + + search_files: tool({ + description: + 'Search for a regex pattern across files in the workspace. Returns up to 20 matches. Skips binary files and node_modules/.git.', + inputSchema: z.object({ + pattern: z.string().describe('Regex pattern to search for'), + path: z.string().describe('Relative path to search within (use "." for root)').default('.'), + }), + execute: async (input: { pattern: string; path: string }) => { + try { + const resolved = resolveSandboxed(workspacePath, input.path); + const regex = new RegExp(input.pattern, 'gi'); + const matches: Array<{ file: string; line: number; text: string }> = []; + + await searchDirectory(resolved, workspacePath, regex, matches); + + return { matches, total: matches.length }; + } catch (error) { + return { error: error instanceof Error ? error.message : String(error) }; + } + }, + }), + }; +} + +/** + * Recursively search a directory for regex matches. 
+ */ +async function searchDirectory( + dirPath: string, + workspacePath: string, + regex: RegExp, + matches: Array<{ file: string; line: number; text: string }>, +): Promise { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + let entries: import('node:fs').Dirent[]; + try { + entries = await fs.readdir(dirPath, { withFileTypes: true }); + } catch { + return; + } + + for (const entry of entries) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + + if (SEARCH_SKIP_DIRS.has(entry.name)) continue; + + const fullPath = path.join(dirPath, entry.name); + + if (entry.isDirectory()) { + await searchDirectory(fullPath, workspacePath, regex, matches); + } else if (entry.isFile()) { + const ext = path.extname(entry.name).toLowerCase(); + if (BINARY_EXTENSIONS.has(ext)) continue; + + try { + const stat = await fs.stat(fullPath); + if (stat.size > MAX_FILE_SIZE) continue; + + const content = await fs.readFile(fullPath, 'utf-8'); + const lines = content.split('\n'); + + for (let i = 0; i < lines.length; i++) { + if (matches.length >= MAX_SEARCH_MATCHES) return; + regex.lastIndex = 0; + if (regex.test(lines[i])) { + matches.push({ + file: path.relative(workspacePath, fullPath), + line: i + 1, + text: lines[i].substring(0, 200), + }); + } + } + } catch { + // Skip unreadable files + } + } + } +} diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2eb72cb92..e931db1c8 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -134,7 +134,9 @@ async function parseEvaluatorList( const typeValue = typeof rawType === 'string' ? 
normalizeEvaluatorType(rawType) : rawType; // Unknown types are treated as custom assertion types (resolved via registry discovery) - const isCustomType = typeof typeValue === 'string' && !isEvaluatorKind(typeValue); + // 'agent-judge' is a known alias (maps to 'llm-judge'), not a custom type + const isCustomType = + typeof typeValue === 'string' && !isEvaluatorKind(typeValue) && typeValue !== 'agent-judge'; if (typeof typeValue !== 'string') { logWarning(`Skipping evaluator with invalid type in '${evalId}'`); continue; @@ -852,7 +854,8 @@ async function parseEvaluatorList( continue; } - if (typeValue === 'agent-judge') { + // Backward compat: agent-judge / agent_judge → llm-judge with agent-specific fields + if ((typeValue as string) === 'agent-judge') { // Validate max_steps (1-50) const rawMaxSteps = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; let maxSteps: number | undefined; @@ -864,7 +867,7 @@ async function parseEvaluatorList( rawMaxSteps > 50 ) { logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`, + `Skipping llm-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`, ); continue; } @@ -877,7 +880,7 @@ async function parseEvaluatorList( if (rawTemperature !== undefined) { if (typeof rawTemperature !== 'number' || rawTemperature < 0 || rawTemperature > 2) { logWarning( - `Skipping agent-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`, + `Skipping llm-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`, ); continue; } @@ -910,7 +913,7 @@ async function parseEvaluatorList( evaluators.push({ name, - type: 'agent-judge', + type: 'llm-judge', ...(agentPrompt ? { prompt: agentPrompt } : {}), ...(agentPromptPath ? 
{ promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } @@ -1266,6 +1269,9 @@ async function parseEvaluatorList( 'config', 'required', 'negate', + 'max_steps', + 'maxSteps', + 'temperature', ]); const config: Record = {}; for (const [key, value] of Object.entries(rawEvaluator)) { @@ -1284,6 +1290,21 @@ async function parseEvaluatorList( const finalConfig = promptScriptConfig ?? (Object.keys(mergedConfig).length > 0 ? mergedConfig : undefined); + // Parse optional max_steps and temperature (used in agent mode) + const rawMaxStepsLlm = rawEvaluator.max_steps ?? rawEvaluator.maxSteps; + const llmMaxSteps = + typeof rawMaxStepsLlm === 'number' && + Number.isInteger(rawMaxStepsLlm) && + rawMaxStepsLlm >= 1 && + rawMaxStepsLlm <= 50 + ? rawMaxStepsLlm + : undefined; + const rawTempLlm = rawEvaluator.temperature; + const llmTemperature = + typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 + ? rawTempLlm + : undefined; + evaluators.push({ name, type: 'llm-judge', @@ -1297,6 +1318,8 @@ async function parseEvaluatorList( ...(required !== undefined ? { required } : {}), ...(negate !== undefined ? { negate } : {}), ...(finalConfig ? { config: finalConfig } : {}), + ...(llmMaxSteps !== undefined ? { max_steps: llmMaxSteps } : {}), + ...(llmTemperature !== undefined ? { temperature: llmTemperature } : {}), }); } diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index dee6b0237..a370c08b4 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -6,9 +6,7 @@ * the EvaluatorRegistry at startup. 
*/ -import { readFileSync } from 'node:fs'; import { - AgentJudgeEvaluator, CodeEvaluator, CompositeEvaluator, CostEvaluator, @@ -34,10 +32,10 @@ import { } from '../evaluators.js'; import { InlineAssertEvaluator } from '../evaluators/inline-assert.js'; import { resolveCustomPrompt } from '../evaluators/prompt-resolution.js'; +import { isAgentProvider } from '../providers/types.js'; import type { Provider } from '../providers/types.js'; import type { ToolTrajectoryEvaluatorConfig } from '../trace.js'; import type { - AgentJudgeEvaluatorConfig, CodeEvaluatorConfig, CompositeEvaluatorConfig, ContainsAllEvaluatorConfig, @@ -74,6 +72,11 @@ export const INLINE_ASSERT_FN = Symbol.for('agentv.inline-assert-fn'); * Factory for `llm-judge` evaluators. * Creates a wrapper that resolves custom prompts at evaluation time and * optionally overrides the judge target per evaluator. + * + * Auto-detects mode based on the resolved judge provider: + * - LLM providers (azure, anthropic, gemini): structured JSON mode + * - Agent providers (claude-cli, copilot, etc.): delegate mode + * - agentv provider: built-in AI SDK agent mode with filesystem tools */ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { const c = config as LlmJudgeEvaluatorConfig; @@ -88,12 +91,18 @@ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { if (!judgeTargetProvider) { throw new Error(`llm-judge evaluator '${c.name}': target '${c.target}' not found in targets`); } + // Only pass judgeTargetProvider for agent providers (delegate mode). + // LLM providers use the normal resolveJudgeProvider path for structured JSON mode. 
+ const isAgent = isAgentProvider(judgeTargetProvider) || judgeTargetProvider.kind === 'agentv'; evaluator = new LlmJudgeEvaluator({ resolveJudgeProvider: async (evalContext) => { if (judgeTargetProvider) return judgeTargetProvider; if (evalContext.judgeProvider) return evalContext.judgeProvider; return judgeProvider; }, + maxSteps: c.max_steps, + temperature: c.temperature, + ...(isAgent ? { judgeTargetProvider } : {}), }); } @@ -198,45 +207,6 @@ export const executionMetricsFactory: EvaluatorFactoryFn = (config) => { }); }; -/** Factory for `agent-judge` evaluators. */ -export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => { - const c = config as AgentJudgeEvaluatorConfig; - const { judgeProvider, targetResolver } = context; - - let customPrompt: string | undefined; - if (c.resolvedPromptPath) { - try { - customPrompt = readFileSync(c.resolvedPromptPath, 'utf-8'); - } catch (error) { - const message = error instanceof Error ? error.message : String(error); - console.warn(`Could not read agent-judge prompt at ${c.resolvedPromptPath}: ${message}`); - } - } else if (c.prompt) { - customPrompt = c.prompt; - } - - let judgeTargetProvider: Provider | undefined; - if (c.target && targetResolver) { - judgeTargetProvider = targetResolver(c.target); - if (!judgeTargetProvider) { - throw new Error( - `agent-judge evaluator '${c.name}': target '${c.target}' not found in targets`, - ); - } - } - - return new AgentJudgeEvaluator({ - resolveJudgeProvider: async (ctx) => { - if (ctx.judgeProvider) return ctx.judgeProvider; - return judgeProvider; - }, - maxSteps: c.max_steps, - temperature: c.temperature, - evaluatorTemplate: customPrompt, - judgeTargetProvider, - }); -}; - /** Factory for `skill-trigger` evaluator. 
*/ export const skillTriggerFactory: EvaluatorFactoryFn = (config) => { return new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig); @@ -440,7 +410,6 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('cost', costFactory) .register('token-usage', tokenUsageFactory) .register('execution-metrics', executionMetricsFactory) - .register('agent-judge', agentJudgeFactory) .register('skill-trigger', skillTriggerFactory) .register('contains', containsFactory) .register('contains-any', containsAnyFactory) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index b69c272ab..b174af42f 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -158,7 +158,6 @@ const EVALUATOR_KIND_VALUES = [ 'cost', 'token-usage', 'execution-metrics', - 'agent-judge', 'skill-trigger', 'contains', 'contains-any', @@ -337,6 +336,10 @@ export type LlmJudgeEvaluatorConfig = { readonly target?: string; /** Pass-through configuration for custom evaluator prompts (legacy, prefer prompt.config) */ readonly config?: Record; + /** Maximum agent steps for agentv built-in mode (default 10, max 50). Ignored in LLM mode. */ + readonly max_steps?: number; + /** Temperature override for judge calls */ + readonly temperature?: number; }; /** @@ -529,35 +532,6 @@ export type ExecutionMetricsEvaluatorConfig = { readonly negate?: boolean; }; -/** - * Configuration for the agent-judge evaluator. - * Runs an agentic investigation loop to audit workspaces and verify criteria. 
- * Two modes: - * - Built-in: Uses AI SDK generateText() with sandboxed filesystem tools - * - Judge target: Delegates to an external agent provider via Provider.invoke() - */ -export type AgentJudgeEvaluatorConfig = { - readonly name: string; - readonly type: 'agent-judge'; - /** Custom evaluation prompt (inline text or file path) */ - readonly prompt?: string; - readonly promptPath?: string; - /** Resolved absolute path for prompt file */ - readonly resolvedPromptPath?: string; - /** Rubric items for structured evaluation (reuses llm-judge rubric infra) */ - readonly rubrics?: readonly RubricItem[]; - /** Maximum agent steps for built-in mode (default 10, max 50) */ - readonly max_steps?: number; - /** Temperature for built-in mode (default 0) */ - readonly temperature?: number; - /** Target name — delegates agent loop to this provider instead of built-in mode */ - readonly target?: string; - readonly weight?: number; - readonly required?: boolean | number; - /** When true, inverts the evaluator score (1 - score) and swaps pass/fail verdict */ - readonly negate?: boolean; -}; - /** * Configuration for the contains assertion evaluator. * Checks whether the candidate output contains a specified substring. 
@@ -766,7 +740,6 @@ export type EvaluatorConfig = | CostEvaluatorConfig | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig - | AgentJudgeEvaluatorConfig | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 690373b43..977b68daa 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -87,6 +87,8 @@ const LlmJudgeSchema = EvaluatorCommonSchema.extend({ model: z.string().optional(), target: z.string().optional(), config: z.record(z.unknown()).optional(), + max_steps: z.number().int().min(1).max(50).optional(), + temperature: z.number().min(0).max(2).optional(), }); /** Aggregator configs for composite evaluator */ diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index dd28ea304..bb77b4710 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -47,7 +47,6 @@ export type AssertionType = | 'cost' | 'token-usage' | 'execution-metrics' - | 'agent-judge' | 'skill-trigger' | 'contains' | 'contains-any' @@ -67,7 +66,6 @@ export type AssertionType = | 'field_accuracy' | 'token_usage' | 'execution_metrics' - | 'agent_judge' | 'contains_any' | 'contains_all' | 'icontains_any' diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 9093c7e48..b55528f3c 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,7 +53,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -67,20 +72,29 @@ "properties": { 
"type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -115,7 +129,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -129,20 +148,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -164,7 +192,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -178,20 +211,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -228,7 +270,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -280,7 +325,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -310,7 +358,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + 
"llm_judge" + ] }, "prompt": { "anyOf": [ @@ -404,7 +455,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -421,9 +475,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -483,7 +549,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -499,7 +567,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -516,7 +587,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -533,13 +607,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -569,11 +648,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -614,7 +702,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -628,7 +721,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -639,7 +737,9 @@ ] } }, - 
"required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -647,7 +747,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -661,7 +766,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -672,7 +782,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -702,7 +815,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -714,7 +830,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -736,17 +856,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -783,7 +912,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -820,7 +952,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -850,7 +985,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -865,7 +1003,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": 
false }, { @@ -895,7 +1035,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -927,7 +1070,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -957,7 +1102,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -1011,7 +1159,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1033,7 +1184,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1069,7 +1222,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1105,7 +1261,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1135,10 +1294,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1174,7 +1338,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -1255,7 +1422,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1265,7 +1435,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -1302,7 +1475,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": 
[ @@ -1354,7 +1530,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -1384,7 +1563,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -1478,7 +1660,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -1495,9 +1680,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1557,7 +1754,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1573,7 +1772,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1590,7 +1792,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -1607,13 +1812,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -1643,11 +1853,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -1688,7 +1907,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + 
"enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1702,7 +1926,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1713,7 +1942,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -1721,7 +1952,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1735,7 +1971,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -1746,7 +1987,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -1776,7 +2020,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -1788,7 +2035,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -1810,17 +2061,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -1857,7 +2117,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -1894,7 +2157,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], 
"additionalProperties": false }, { @@ -1924,7 +2190,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -1939,7 +2208,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -1969,7 +2240,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -2001,7 +2275,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2031,7 +2307,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -2085,7 +2364,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2107,7 +2389,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2143,7 +2427,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2179,7 +2466,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2209,10 +2499,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2248,7 +2543,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -2329,7 +2627,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], 
"additionalProperties": false } } @@ -2339,7 +2640,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -2376,7 +2680,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -2428,7 +2735,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -2458,7 +2768,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -2552,7 +2865,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -2569,9 +2885,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2631,7 +2959,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -2647,7 +2977,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2664,7 +2997,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -2681,13 +3017,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -2717,11 +3058,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + 
"tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -2762,7 +3112,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2776,7 +3131,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2787,7 +3147,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -2795,7 +3157,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2809,7 +3176,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -2820,7 +3192,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -2850,7 +3225,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -2862,7 +3240,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -2884,17 +3266,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", 
"fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -2931,7 +3322,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -2968,7 +3362,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -2998,7 +3395,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -3013,7 +3413,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3043,7 +3445,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -3075,7 +3480,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3105,7 +3512,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -3159,7 +3569,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3181,7 +3594,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3217,7 +3632,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3253,7 +3671,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3283,10 +3704,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + 
"required": [ + "type" + ], "additionalProperties": false }, { @@ -3322,7 +3748,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -3403,7 +3832,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3413,7 +3845,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -3462,7 +3897,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -3514,7 +3952,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -3544,7 +3985,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -3638,7 +4082,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -3655,9 +4102,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3717,7 +4176,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -3733,7 +4194,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -3750,7 +4214,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -3767,13 +4234,18 @@ "type": 
"string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -3803,11 +4275,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -3848,7 +4329,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3862,7 +4348,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3873,7 +4364,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -3881,7 +4374,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3895,7 +4393,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -3906,7 +4409,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -3936,7 +4442,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -3948,7 +4457,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": 
"boolean" @@ -3970,17 +4483,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -4017,7 +4539,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4054,7 +4579,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -4084,7 +4612,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -4099,7 +4630,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4129,7 +4662,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -4161,7 +4697,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4191,7 +4729,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -4245,7 +4786,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4267,7 +4811,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4303,7 +4849,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": 
false }, { @@ -4339,7 +4888,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4369,10 +4921,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4408,7 +4965,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -4489,7 +5049,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4499,7 +5062,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -4536,7 +5102,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -4588,7 +5157,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -4618,7 +5190,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -4712,7 +5287,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -4729,9 +5307,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4791,7 +5381,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -4807,7 +5399,10 @@ 
"maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -4824,7 +5419,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -4841,13 +5439,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -4877,11 +5480,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -4922,7 +5534,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4936,7 +5553,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4947,7 +5569,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -4955,7 +5579,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4969,7 +5598,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -4980,7 +5614,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -5010,7 +5647,10 @@ }, "type": { "type": "string", - 
"enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -5022,7 +5662,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -5044,17 +5688,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -5091,7 +5744,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5128,7 +5784,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -5158,7 +5817,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -5173,7 +5835,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5203,7 +5867,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -5235,7 +5902,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5265,7 +5934,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -5319,7 +5991,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + 
"outcome" + ], "additionalProperties": false } } @@ -5341,7 +6016,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5377,7 +6054,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5413,7 +6093,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5443,10 +6126,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5482,7 +6170,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -5563,7 +6254,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5573,7 +6267,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -5610,7 +6307,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -5662,7 +6362,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -5692,7 +6395,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -5786,7 +6492,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -5803,9 +6512,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + 
"maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5865,7 +6586,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -5881,7 +6604,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -5898,7 +6624,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -5915,13 +6644,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -5951,11 +6685,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -5996,7 +6739,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6010,7 +6758,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6021,7 +6774,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -6029,7 +6784,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6043,7 +6803,12 @@ "anyOf": [ { "type": "string", - 
"enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -6054,7 +6819,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -6084,7 +6852,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -6096,7 +6867,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -6118,17 +6893,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -6165,7 +6949,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -6202,7 +6989,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -6232,7 +7022,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -6247,7 +7040,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6277,7 +7072,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -6309,7 +7107,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], 
"additionalProperties": false }, { @@ -6339,7 +7139,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -6393,7 +7196,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6415,7 +7221,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6451,7 +7259,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6487,7 +7298,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6517,10 +7331,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -6556,7 +7375,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -6637,7 +7459,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -6647,7 +7472,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -6668,7 +7496,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -6679,7 +7511,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -6707,7 +7541,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + 
"shared", + "per_test" + ] }, "repos": { "type": "array", @@ -6731,7 +7568,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -6745,7 +7585,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -6758,7 +7601,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -6787,7 +7633,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -6823,7 +7672,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6854,7 +7707,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6885,7 +7742,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6916,7 +7777,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -6926,7 +7791,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -6948,7 +7817,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -6986,7 +7857,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -7000,20 +7876,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + 
"text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -7035,7 +7920,12 @@ "properties": { "role": { "type": "string", - "enum": ["system", "user", "assistant", "tool"] + "enum": [ + "system", + "user", + "assistant", + "tool" + ] }, "content": { "anyOf": [ @@ -7049,20 +7939,29 @@ "properties": { "type": { "type": "string", - "enum": ["text", "file"] + "enum": [ + "text", + "file" + ] }, "value": { "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false } } ] } }, - "required": ["role", "content"], + "required": [ + "role", + "content" + ], "additionalProperties": false } } @@ -7099,7 +7998,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -7151,7 +8053,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -7181,7 +8086,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -7275,7 +8183,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -7292,9 +8203,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7354,7 +8277,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ 
-7370,7 +8295,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7387,7 +8315,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -7404,13 +8335,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -7440,11 +8376,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -7485,7 +8430,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7499,7 +8449,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7510,7 +8465,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -7518,7 +8475,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7532,7 +8494,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -7543,7 +8510,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -7573,7 +8543,10 @@ }, "type": { 
"type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -7585,7 +8558,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -7607,17 +8584,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -7654,7 +8640,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -7691,7 +8680,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -7721,7 +8713,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -7736,7 +8731,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7766,7 +8763,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -7798,7 +8798,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7828,7 +8830,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -7882,7 +8887,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + 
"score_range", + "outcome" + ], "additionalProperties": false } } @@ -7904,7 +8912,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -7940,7 +8950,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -7976,7 +8989,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8006,10 +9022,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8045,7 +9066,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -8126,7 +9150,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8136,7 +9163,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -8173,7 +9203,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -8225,7 +9258,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -8255,7 +9291,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -8349,7 +9388,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8366,9 +9408,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + 
"minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8428,7 +9482,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8444,7 +9500,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8461,7 +9520,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -8478,13 +9540,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -8514,11 +9581,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -8559,7 +9635,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8573,7 +9654,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8584,7 +9670,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -8592,7 +9680,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8606,7 +9699,12 @@ "anyOf": [ { "type": 
"string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -8617,7 +9715,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -8647,7 +9748,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -8659,7 +9763,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -8681,17 +9789,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -8728,7 +9845,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -8765,7 +9885,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -8795,7 +9918,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -8810,7 +9936,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -8840,7 +9968,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -8872,7 +10003,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + 
"type" + ], "additionalProperties": false }, { @@ -8902,7 +10035,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -8956,7 +10092,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -8978,7 +10117,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9014,7 +10155,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9050,7 +10194,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9080,10 +10227,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9119,7 +10271,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -9200,7 +10355,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -9210,7 +10368,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -9247,7 +10408,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -9299,7 +10463,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -9329,7 +10496,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", 
+ "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -9423,7 +10593,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -9440,9 +10613,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9502,7 +10687,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9518,7 +10705,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -9535,7 +10725,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -9552,13 +10745,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -9588,11 +10786,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -9633,7 +10840,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9647,7 +10859,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ 
-9658,7 +10875,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -9666,7 +10885,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9680,7 +10904,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -9691,7 +10920,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -9721,7 +10953,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -9733,7 +10968,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -9755,17 +10994,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -9802,7 +11050,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -9839,7 +11090,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -9869,7 +11123,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -9884,7 +11141,9 @@ "minimum": 0 } }, - "required": ["type"], 
+ "required": [ + "type" + ], "additionalProperties": false }, { @@ -9914,7 +11173,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -9946,7 +11208,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -9976,7 +11240,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -10030,7 +11297,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10052,7 +11322,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10088,7 +11360,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10124,7 +11399,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10154,10 +11432,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10193,7 +11476,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -10274,7 +11560,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10284,7 +11573,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -10333,7 +11625,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", 
"code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -10385,7 +11680,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -10415,7 +11713,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -10509,7 +11810,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -10526,9 +11830,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10588,7 +11904,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -10604,7 +11922,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -10621,7 +11942,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -10638,13 +11962,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -10674,11 +12003,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ 
-10719,7 +12057,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10733,7 +12076,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10744,7 +12092,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -10752,7 +12102,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10766,7 +12121,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -10777,7 +12137,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -10807,7 +12170,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -10819,7 +12185,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -10841,17 +12211,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -10888,7 +12267,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ 
-10925,7 +12307,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -10955,7 +12340,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -10970,7 +12358,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11000,7 +12390,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -11032,7 +12425,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11062,7 +12457,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -11116,7 +12514,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11138,7 +12539,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11174,7 +12577,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11210,7 +12616,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11240,10 +12649,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11279,7 +12693,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -11360,7 
+12777,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11370,7 +12790,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -11407,7 +12830,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -11459,7 +12885,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -11489,7 +12918,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -11583,7 +13015,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -11600,9 +13035,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11662,7 +13109,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -11678,7 +13127,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11695,7 +13147,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -11712,13 +13167,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": 
false }, { @@ -11748,11 +13208,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -11793,7 +13262,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11807,7 +13281,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11818,7 +13297,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -11826,7 +13307,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11840,7 +13326,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -11851,7 +13342,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -11881,7 +13375,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -11893,7 +13390,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -11915,17 +13416,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": 
"string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -11962,7 +13472,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -11999,7 +13512,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -12029,7 +13545,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -12044,7 +13563,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12074,7 +13595,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -12106,7 +13630,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12136,7 +13662,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -12190,7 +13719,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12212,7 +13744,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12248,7 +13782,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12284,7 +13821,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ 
-12314,10 +13854,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12353,7 +13898,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -12434,7 +13982,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12444,7 +13995,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -12481,7 +14035,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -12533,7 +14090,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -12563,7 +14123,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -12657,7 +14220,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -12674,9 +14240,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12736,7 +14314,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -12752,7 +14332,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -12769,7 
+14352,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -12786,13 +14372,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -12822,11 +14413,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -12867,7 +14467,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12881,7 +14486,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12892,7 +14502,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -12900,7 +14512,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12914,7 +14531,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -12925,7 +14547,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -12955,7 +14580,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": 
"array", @@ -12967,7 +14595,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -12989,17 +14621,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -13036,7 +14677,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -13073,7 +14717,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -13103,7 +14750,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -13118,7 +14768,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13148,7 +14800,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -13180,7 +14835,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13210,7 +14867,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -13264,7 +14924,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13286,7 +14949,9 @@ "type": "string" } }, - 
"required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13322,7 +14987,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13358,7 +15026,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13388,10 +15059,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -13427,7 +15103,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -13508,7 +15187,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -13518,7 +15200,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -13539,7 +15224,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -13550,7 +15239,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -13578,7 +15269,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -13602,7 +15296,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + "url" + ], "additionalProperties": false }, { @@ -13616,7 +15313,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -13629,7 +15329,10 @@ }, "resolve": { 
"type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -13658,7 +15361,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -13694,7 +15400,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13725,7 +15435,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13756,7 +15470,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13787,7 +15505,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -13797,7 +15519,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -13819,7 +15545,9 @@ "type": "string" } }, - "required": ["id"], + "required": [ + "id" + ], "additionalProperties": false } }, @@ -13874,7 +15602,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -13926,7 +15657,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -13956,7 +15690,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -14050,7 +15787,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14067,9 +15807,21 
@@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14129,7 +15881,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14145,7 +15899,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -14162,7 +15919,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -14179,13 +15939,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -14215,11 +15980,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -14260,7 +16034,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14274,7 +16053,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14285,7 +16069,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -14293,7 +16079,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + 
"enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14307,7 +16098,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -14318,7 +16114,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -14348,7 +16147,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -14360,7 +16162,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -14382,17 +16188,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -14429,7 +16244,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -14466,7 +16284,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -14496,7 +16317,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -14511,7 +16335,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14541,7 +16367,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + 
"execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -14573,7 +16402,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14603,7 +16434,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -14657,7 +16491,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14679,7 +16516,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14715,7 +16554,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14751,7 +16593,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14781,10 +16626,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -14820,7 +16670,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -14901,7 +16754,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -14911,7 +16767,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -14948,7 +16807,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -15000,7 +16862,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + 
"command" + ], "additionalProperties": false }, { @@ -15030,7 +16895,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -15124,7 +16992,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15141,9 +17012,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15203,7 +17086,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15219,7 +17104,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -15236,7 +17124,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -15253,13 +17144,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -15289,11 +17185,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -15334,7 +17239,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15348,7 +17258,12 
@@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15359,7 +17274,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -15367,7 +17284,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15381,7 +17303,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -15392,7 +17319,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -15422,7 +17352,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -15434,7 +17367,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -15456,17 +17393,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -15503,7 +17449,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -15540,7 +17489,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -15570,7 +17522,10 @@ }, "type": { "type": "string", - 
"enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -15585,7 +17540,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15615,7 +17572,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -15647,7 +17607,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15677,7 +17639,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -15731,7 +17696,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15753,7 +17721,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15789,7 +17759,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15825,7 +17798,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15855,10 +17831,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -15894,7 +17875,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -15975,7 +17959,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -15985,7 +17972,10 @@ "minItems": 1 } }, - 
"required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -16022,7 +18012,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -16074,7 +18067,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -16104,7 +18100,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -16198,7 +18197,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -16215,9 +18217,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16277,7 +18291,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16293,7 +18309,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -16310,7 +18329,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -16327,13 +18349,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -16363,11 +18390,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", 
- "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -16408,7 +18444,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16422,7 +18463,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16433,7 +18479,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -16441,7 +18489,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16455,7 +18508,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -16466,7 +18524,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -16496,7 +18557,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -16508,7 +18572,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -16530,17 +18598,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], 
"additionalProperties": false }, { @@ -16577,7 +18654,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -16614,7 +18694,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -16644,7 +18727,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -16659,7 +18745,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16689,7 +18777,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -16721,7 +18812,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16751,7 +18844,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -16805,7 +18901,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -16827,7 +18926,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -16863,7 +18964,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -16899,7 +19003,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -16929,10 +19036,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], 
"additionalProperties": false }, { @@ -16968,7 +19080,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -17049,7 +19164,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17059,7 +19177,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -17080,7 +19201,11 @@ }, "strategy": { "type": "string", - "enum": ["pass_at_k", "mean", "confidence_interval"] + "enum": [ + "pass_at_k", + "mean", + "confidence_interval" + ] }, "cost_limit_usd": { "type": "number", @@ -17091,7 +19216,9 @@ "minimum": 0 } }, - "required": ["count"], + "required": [ + "count" + ], "additionalProperties": false }, "total_budget_usd": { @@ -17142,7 +19269,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -17194,7 +19324,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -17224,7 +19357,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -17318,7 +19454,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17335,9 +19474,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17397,7 +19548,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], 
"additionalProperties": false }, { @@ -17413,7 +19566,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -17430,7 +19586,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -17447,13 +19606,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -17483,11 +19647,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -17528,7 +19701,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17542,7 +19720,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17553,7 +19736,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -17561,7 +19746,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17575,7 +19765,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -17586,7 +19781,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], 
"additionalProperties": false }, { @@ -17616,7 +19814,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -17628,7 +19829,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -17650,17 +19855,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -17697,7 +19911,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -17734,7 +19951,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -17764,7 +19984,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -17779,7 +20002,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17809,7 +20034,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + "execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -17841,7 +20069,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17871,7 +20101,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -17925,7 
+20158,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -17947,7 +20183,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -17983,7 +20221,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18019,7 +20260,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18049,10 +20293,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18088,7 +20337,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -18169,7 +20421,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -18179,7 +20434,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -18216,7 +20474,10 @@ }, "type": { "type": "string", - "enum": ["code-judge", "code_judge"] + "enum": [ + "code-judge", + "code_judge" + ] }, "command": { "anyOf": [ @@ -18268,7 +20529,10 @@ "additionalProperties": {} } }, - "required": ["type", "command"], + "required": [ + "type", + "command" + ], "additionalProperties": false }, { @@ -18298,7 +20562,10 @@ }, "type": { "type": "string", - "enum": ["llm-judge", "llm_judge"] + "enum": [ + "llm-judge", + "llm_judge" + ] }, "prompt": { "anyOf": [ @@ -18392,7 +20659,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ 
-18409,9 +20679,21 @@ "config": { "type": "object", "additionalProperties": {} + }, + "max_steps": { + "type": "integer", + "minimum": 1, + "maximum": 50 + }, + "temperature": { + "type": "number", + "minimum": 0, + "maximum": 2 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18471,7 +20753,9 @@ } } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18487,7 +20771,10 @@ "maximum": 1 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -18504,7 +20791,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false }, { @@ -18521,13 +20811,18 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false } ] } }, - "required": ["type", "aggregator"], + "required": [ + "type", + "aggregator" + ], "additionalProperties": false }, { @@ -18557,11 +20852,20 @@ }, "type": { "type": "string", - "enum": ["tool-trajectory", "tool_trajectory"] + "enum": [ + "tool-trajectory", + "tool_trajectory" + ] }, "mode": { "type": "string", - "enum": ["any_order", "in_order", "exact", "subset", "superset"] + "enum": [ + "any_order", + "in_order", + "exact", + "subset", + "superset" + ] }, "minimums": { "type": "object", @@ -18602,7 +20906,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -18616,7 +20925,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -18627,7 +20941,9 @@ ] } }, - "required": ["tool"], + "required": [ + "tool" + ], "additionalProperties": false } }, @@ -18635,7 +20951,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", 
"superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -18649,7 +20970,12 @@ "anyOf": [ { "type": "string", - "enum": ["exact", "ignore", "subset", "superset"] + "enum": [ + "exact", + "ignore", + "subset", + "superset" + ] }, { "type": "array", @@ -18660,7 +20986,10 @@ ] } }, - "required": ["type", "mode"], + "required": [ + "type", + "mode" + ], "additionalProperties": false }, { @@ -18690,7 +21019,10 @@ }, "type": { "type": "string", - "enum": ["field-accuracy", "field_accuracy"] + "enum": [ + "field-accuracy", + "field_accuracy" + ] }, "fields": { "type": "array", @@ -18702,7 +21034,11 @@ }, "match": { "type": "string", - "enum": ["exact", "numeric_tolerance", "date"] + "enum": [ + "exact", + "numeric_tolerance", + "date" + ] }, "required": { "type": "boolean" @@ -18724,17 +21060,26 @@ } } }, - "required": ["path", "match"], + "required": [ + "path", + "match" + ], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": ["weighted_average", "all_or_nothing"] + "enum": [ + "weighted_average", + "all_or_nothing" + ] } }, - "required": ["type", "fields"], + "required": [ + "type", + "fields" + ], "additionalProperties": false }, { @@ -18771,7 +21116,10 @@ "minimum": 0 } }, - "required": ["type", "threshold"], + "required": [ + "type", + "threshold" + ], "additionalProperties": false }, { @@ -18808,7 +21156,10 @@ "minimum": 0 } }, - "required": ["type", "budget"], + "required": [ + "type", + "budget" + ], "additionalProperties": false }, { @@ -18838,7 +21189,10 @@ }, "type": { "type": "string", - "enum": ["token-usage", "token_usage"] + "enum": [ + "token-usage", + "token_usage" + ] }, "max_total": { "type": "number", @@ -18853,7 +21207,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18883,7 +21239,10 @@ }, "type": { "type": "string", - "enum": ["execution-metrics", "execution_metrics"] + "enum": [ + 
"execution-metrics", + "execution_metrics" + ] }, "max_tool_calls": { "type": "number", @@ -18915,7 +21274,9 @@ "minimum": 0 } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -18945,7 +21306,10 @@ }, "type": { "type": "string", - "enum": ["agent-judge", "agent_judge"] + "enum": [ + "agent-judge", + "agent_judge" + ] }, "prompt": { "type": "string" @@ -18999,7 +21363,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -19021,7 +21388,9 @@ "type": "string" } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -19057,7 +21426,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -19093,7 +21465,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -19123,10 +21498,15 @@ }, "type": { "type": "string", - "enum": ["is-json", "is_json"] + "enum": [ + "is-json", + "is_json" + ] } }, - "required": ["type"], + "required": [ + "type" + ], "additionalProperties": false }, { @@ -19162,7 +21542,10 @@ "type": "string" } }, - "required": ["type", "value"], + "required": [ + "type", + "value" + ], "additionalProperties": false }, { @@ -19243,7 +21626,10 @@ "minLength": 1 } }, - "required": ["score_range", "outcome"], + "required": [ + "score_range", + "outcome" + ], "additionalProperties": false } } @@ -19253,7 +21639,10 @@ "minItems": 1 } }, - "required": ["type", "criteria"], + "required": [ + "type", + "criteria" + ], "additionalProperties": false } ] @@ -19269,7 +21658,10 @@ }, "isolation": { "type": "string", - "enum": ["shared", "per_test"] + "enum": [ + "shared", + "per_test" + ] }, "repos": { "type": "array", @@ -19293,7 +21685,10 @@ "format": "uri" } }, - "required": ["type", "url"], + "required": [ + "type", + 
"url" + ], "additionalProperties": false }, { @@ -19307,7 +21702,10 @@ "type": "string" } }, - "required": ["type", "path"], + "required": [ + "type", + "path" + ], "additionalProperties": false } ] @@ -19320,7 +21718,10 @@ }, "resolve": { "type": "string", - "enum": ["remote", "local"] + "enum": [ + "remote", + "local" + ] }, "ancestor": { "type": "integer", @@ -19349,7 +21750,10 @@ "additionalProperties": false } }, - "required": ["path", "source"], + "required": [ + "path", + "source" + ], "additionalProperties": false } }, @@ -19385,7 +21789,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -19416,7 +21824,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -19447,7 +21859,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -19478,7 +21894,11 @@ }, "reset": { "type": "string", - "enum": ["none", "fast", "strict"] + "enum": [ + "none", + "fast", + "strict" + ] } }, "additionalProperties": false @@ -19488,7 +21908,11 @@ }, "mode": { "type": "string", - "enum": ["pooled", "temp", "static"] + "enum": [ + "pooled", + "temp", + "static" + ] }, "path": { "type": "string" @@ -19502,7 +21926,9 @@ ] } }, - "required": ["tests"], + "required": [ + "tests" + ], "additionalProperties": false } } From bd576b91c95cf5e0b408856991fbcf3fb6f8d4b6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 13:28:25 +0000 Subject: [PATCH 07/13] feat: add --judge-target and --model CLI flags with orchestrator wiring Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/commands/run.ts | 13 ++++++++++ apps/cli/src/commands/eval/run-eval.ts | 11 +++++++++ packages/core/src/evaluation/orchestrator.ts | 26 +++++++++++++++++++- 3 files changed, 49 
insertions(+), 1 deletion(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 4a7ec1b50..7e2117107 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -163,6 +163,17 @@ export const evalRunCommand = command({ description: 'Write companion artifacts (grading/.json, timing.json, benchmark.json) to the specified directory', }), + judgeTarget: option({ + type: optional(string), + long: 'judge-target', + description: + 'Override judge target for all evaluators (e.g., "agentv", or a target name from targets.yaml)', + }), + model: option({ + type: optional(string), + long: 'model', + description: 'Override model for the judge target (e.g., "openai:gpt-5-mini")', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -203,6 +214,8 @@ export const evalRunCommand = command({ strict: args.strict, benchmarkJson: args.benchmarkJson, artifacts: args.artifacts, + judgeTarget: args.judgeTarget, + model: args.model, }; await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); }, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 43eec380b..1f3d77f46 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -82,6 +82,8 @@ interface NormalizedOptions { readonly workspacePath?: string; readonly benchmarkJson?: string; readonly artifacts?: string; + readonly judgeTarget?: string; + readonly model?: string; } function normalizeBoolean(value: unknown): boolean { @@ -249,6 +251,8 @@ function normalizeOptions( workspacePath, benchmarkJson: normalizeString(rawOptions.benchmarkJson), artifacts: normalizeString(rawOptions.artifacts), + judgeTarget: normalizeString(rawOptions.judgeTarget), + model: normalizeString(rawOptions.model), } satisfies NormalizedOptions; } @@ -593,6 +597,8 @@ async function runSingleEvalFile(params: { trials: 
trialsConfig, totalBudgetUsd, failOnError, + judgeTarget: options.judgeTarget, + model: options.model, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { // Finalize streaming observer span with score @@ -674,6 +680,11 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + // Validate --judge-target / --model combinations + if (options.judgeTarget === 'agentv' && !options.model) { + throw new Error('--judge-target agentv requires --model (e.g., --model openai:gpt-5-mini)'); + } + // --retry-errors: override filter to only re-run execution_error test cases. // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, // since the retry source and output destination may refer to the same file. diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 396bc15fe..58e42903b 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -235,6 +235,10 @@ export interface RunEvaluationOptions { readonly retainOnSuccess?: 'keep' | 'cleanup'; /** Retention policy override for failed cases */ readonly retainOnFailure?: 'keep' | 'cleanup'; + /** CLI override: judge target name (e.g., "agentv" or a target from targets.yaml) */ + readonly judgeTarget?: string; + /** CLI override: model for judge target (e.g., "openai:gpt-5-mini") */ + readonly model?: string; } export async function runEvaluation( @@ -271,6 +275,8 @@ export async function runEvaluation( workspaceClean, retainOnSuccess, retainOnFailure, + judgeTarget: cliJudgeTarget, + model: cliModel, } = options; // Disable cache when trials > 1 (cache makes trials deterministic = pointless) @@ -335,6 +341,23 @@ export async function runEvaluation( const resolveJudgeProvider = async ( targetContext: ResolvedTarget, ): Promise => { + // CLI 
--judge-target takes highest priority + if (cliJudgeTarget) { + if (cliJudgeTarget === 'agentv') { + // Create an agentv provider on-the-fly with the CLI model + const { AgentvProvider } = await import('./providers/agentv-provider.js'); + return new AgentvProvider('agentv', { model: cliModel!, temperature: 0 }); + } + const overrideTarget = resolveTargetByName(cliJudgeTarget); + if (!overrideTarget) { + throw new Error(`--judge-target "${cliJudgeTarget}" not found in targets`); + } + return getOrCreateProvider(overrideTarget); + } + + // TODO: When --model is provided without --judge-target, override the model of + // whichever judge target is resolved. For now, --model only works with --judge-target agentv. + const judgeName = targetContext.judgeTarget ?? targetContext.name; const resolvedJudge = resolveTargetByName(judgeName); if (!resolvedJudge) { @@ -346,7 +369,8 @@ export async function runEvaluation( // Validate judge_target: error if an agent provider would be used as judge. // Agent providers can't return structured JSON for judging — they respond with // tool calls and markdown, causing silent score-0 failures. - if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget) { + // CLI --judge-target override also satisfies this requirement. + if (isAgentProvider(getOrCreateProvider(target)) && !target.judgeTarget && !cliJudgeTarget) { throw new Error( `Target "${target.name}" is an agent provider ("${target.kind}") with no judge_target — agent providers cannot return structured JSON for judging. Set judge_target to an LLM provider (e.g., azure-llm).`, ); From a11b2abde4a7c0c9ce0624673bc59200e7354466 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 13:33:02 +0000 Subject: [PATCH 08/13] refactor: unify llm-judge/agent-judge in transpiler NL conversion The transpiler now handles llm-judge with rubrics the same way as agent-judge, expanding rubric items into individual NL assertion strings. 
Part of #614 --- .../loaders/eval-yaml-transpiler.ts | 11 +++++--- .../loaders/eval-yaml-transpiler.test.ts | 25 +++++++++++++++++++ 2 files changed, 32 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index 4d9560157..7e99fd01f 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -144,8 +144,6 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { case 'llm-judge': case 'llm_judge': - return typeof entry.prompt === 'string' ? entry.prompt : null; - case 'agent-judge': case 'agent_judge': { // Expand each rubric item to its own assertion string @@ -217,10 +215,15 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { /** * Expand a single assertion entry into zero or more NL strings. - * Most assertions produce exactly one string; agent-judge with rubrics expands to many. + * Most assertions produce exactly one string; llm-judge/agent-judge with rubrics expands to many. */ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { - if (entry.type === 'agent-judge' || entry.type === 'agent_judge') { + if ( + entry.type === 'llm-judge' || + entry.type === 'llm_judge' || + entry.type === 'agent-judge' || + entry.type === 'agent_judge' + ) { if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) .map((r) => r.outcome ?? r.criteria ?? 
r.id) diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts index de224a1a2..fa8a7e497 100644 --- a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -269,6 +269,31 @@ describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('No unnecessary steps'); }); + it('converts llm-judge with rubrics to multiple assertions', () => { + const suite = { + tests: [ + { + id: 't1', + input: 'test', + assertions: [ + { type: 'skill-trigger', skill: 's', should_trigger: true }, + { + type: 'llm-judge', + rubrics: [ + { id: 'r1', outcome: 'Response is accurate' }, + { id: 'r2', outcome: 'Formatting is correct' }, + ], + }, + ], + }, + ], + }; + const { files } = transpileEvalYaml(suite); + const evals = files.get('s')?.evals; + expect(evals[0].assertions).toContain('Response is accurate'); + expect(evals[0].assertions).toContain('Formatting is correct'); + }); + it('converts tool-trajectory to NL', () => { const suite = { tests: [ From 65a0d7cd7ba0381066ef8de110fc87a400606bf8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 13:43:43 +0000 Subject: [PATCH 09/13] fix: address code review findings - Add explicit guard for --model when --judge-target is agentv (was non-null assertion) - Consolidate evaluateWithJudgeTarget/evaluateWithDelegatedAgent into shared evaluateWithDelegate - Add try-catch for RegExp construction in search_files tool (prevents crash on invalid patterns) - Add comments explaining agentv exclusion from AGENT_PROVIDER_KINDS and AgentJudgeSchema backward compat Part of #614 --- .../src/evaluation/evaluators/llm-judge.ts | 99 ++++++------------- .../evaluation/loaders/evaluator-parser.ts | 4 +- packages/core/src/evaluation/orchestrator.ts | 7 +- .../evaluation/providers/agentv-provider.ts | 3 +- 
.../evaluation/registry/builtin-evaluators.ts | 2 + .../evaluation/validation/eval-file.schema.ts | 1 + .../providers/agentv-provider.test.ts | 8 +- 7 files changed, 45 insertions(+), 79 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/llm-judge.ts b/packages/core/src/evaluation/evaluators/llm-judge.ts index 91e6578bc..88e6a5268 100644 --- a/packages/core/src/evaluation/evaluators/llm-judge.ts +++ b/packages/core/src/evaluation/evaluators/llm-judge.ts @@ -473,63 +473,7 @@ export class LlmJudgeEvaluator implements Evaluator { * Judge target mode: Delegates to an explicit judgeTargetProvider via Provider.invoke(). */ private async evaluateWithJudgeTarget(context: EvaluationContext): Promise { - const provider = this.judgeTargetProvider as Provider; - - const workspacePath = context.workspacePath; - const prompt = this.buildDelegatedPrompt(context); - - const evaluatorRawRequest: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - prompt, - }; - - try { - const response = await provider.invoke({ - question: prompt, - cwd: workspacePath, - evalCaseId: context.evalCase.id, - attempt: context.attempt, - }); - - const assistantContent = extractLastAssistantContent(response.output); - if (!assistantContent) { - return { - score: 0, - verdict: 'fail', - hits: [], - misses: ['llm-judge judge_target returned no assistant response'], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { mode: 'judge_target', judge_target: provider.targetName }, - }; - } - - const config = context.evaluator; - const rubrics = config?.type === 'llm-judge' ? config.rubrics : undefined; - - const details: JsonObject = { - mode: 'judge_target', - judge_target: provider.targetName, - }; - - return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); - } catch (error) { - const message = error instanceof Error ? 
error.message : String(error); - return { - score: 0, - verdict: 'fail', - hits: [], - misses: [`llm-judge judge_target evaluation failed: ${message}`], - expectedAspectCount: 1, - evaluatorRawRequest, - details: { - mode: 'judge_target', - judge_target: provider.targetName, - error: message, - }, - }; - } + return this.evaluateWithDelegate(context, this.judgeTargetProvider as Provider, 'judge_target'); } /** @@ -538,18 +482,30 @@ export class LlmJudgeEvaluator implements Evaluator { private async evaluateWithDelegatedAgent( context: EvaluationContext, judgeProvider: Provider, + ): Promise { + return this.evaluateWithDelegate(context, judgeProvider, 'delegate'); + } + + /** + * Shared implementation for judge_target and delegate modes. + * Both invoke a provider and parse the agent result from the response. + */ + private async evaluateWithDelegate( + context: EvaluationContext, + provider: Provider, + modeLabel: string, ): Promise { const workspacePath = context.workspacePath; const prompt = this.buildDelegatedPrompt(context); const evaluatorRawRequest: JsonObject = { - mode: 'judge_target', - judge_target: judgeProvider.targetName, + mode: modeLabel, + judge_target: provider.targetName, prompt, }; try { - const response = await judgeProvider.invoke({ + const response = await provider.invoke({ question: prompt, cwd: workspacePath, evalCaseId: context.evalCase.id, @@ -562,10 +518,10 @@ export class LlmJudgeEvaluator implements Evaluator { score: 0, verdict: 'fail', hits: [], - misses: ['llm-judge delegate returned no assistant response'], + misses: [`llm-judge ${modeLabel} returned no assistant response`], expectedAspectCount: 1, evaluatorRawRequest, - details: { mode: 'judge_target', judge_target: judgeProvider.targetName }, + details: { mode: modeLabel, judge_target: provider.targetName }, }; } @@ -573,8 +529,8 @@ export class LlmJudgeEvaluator implements Evaluator { const rubrics = config?.type === 'llm-judge' ? 
config.rubrics : undefined; const details: JsonObject = { - mode: 'judge_target', - judge_target: judgeProvider.targetName, + mode: modeLabel, + judge_target: provider.targetName, }; return this.parseAgentResult(assistantContent, rubrics, evaluatorRawRequest, details); @@ -584,12 +540,12 @@ export class LlmJudgeEvaluator implements Evaluator { score: 0, verdict: 'fail', hits: [], - misses: [`llm-judge delegate evaluation failed: ${message}`], + misses: [`llm-judge ${modeLabel} evaluation failed: ${message}`], expectedAspectCount: 1, evaluatorRawRequest, details: { - mode: 'judge_target', - judge_target: judgeProvider.targetName, + mode: modeLabel, + judge_target: provider.targetName, error: message, }, }; @@ -1255,7 +1211,14 @@ function createFilesystemTools(workspacePath: string) { execute: async (input: { pattern: string; path: string }) => { try { const resolved = resolveSandboxed(workspacePath, input.path); - const regex = new RegExp(input.pattern, 'gi'); + let regex: RegExp; + try { + regex = new RegExp(input.pattern, 'gi'); + } catch (regexErr) { + return { + error: `Invalid regex pattern: ${regexErr instanceof Error ? regexErr.message : String(regexErr)}`, + }; + } const matches: Array<{ file: string; line: number; text: string }> = []; await searchDirectory(resolved, workspacePath, regex, matches); diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index e931db1c8..2b77e87e3 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -1301,9 +1301,7 @@ async function parseEvaluatorList( : undefined; const rawTempLlm = rawEvaluator.temperature; const llmTemperature = - typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 - ? rawTempLlm - : undefined; + typeof rawTempLlm === 'number' && rawTempLlm >= 0 && rawTempLlm <= 2 ? 
rawTempLlm : undefined; evaluators.push({ name, diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 58e42903b..fb8649aa4 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -344,9 +344,12 @@ export async function runEvaluation( // CLI --judge-target takes highest priority if (cliJudgeTarget) { if (cliJudgeTarget === 'agentv') { - // Create an agentv provider on-the-fly with the CLI model + if (!cliModel) { + throw new Error('--judge-target "agentv" requires --model (e.g., "openai:gpt-5-mini")'); + + } const { AgentvProvider } = await import('./providers/agentv-provider.js'); - return new AgentvProvider('agentv', { model: cliModel!, temperature: 0 }); + return new AgentvProvider('agentv', { model: cliModel, temperature: 0 }); } const overrideTarget = resolveTargetByName(cliJudgeTarget); if (!overrideTarget) { diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts index 06abd9cfe..88084c8fa 100644 --- a/packages/core/src/evaluation/providers/agentv-provider.ts +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -42,8 +42,7 @@ function createLanguageModel(modelString: string): LanguageModel { return createGoogleGenerativeAI()(modelName); default: throw new Error( - `Unsupported AI SDK provider "${provider}" in model string "${modelString}". ` + - 'Supported providers: openai, anthropic, azure, google', + `Unsupported AI SDK provider "${provider}" in model string "${modelString}". 
Supported providers: openai, anthropic, azure, google`, ); } } diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index a370c08b4..7d8e6ff88 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -93,6 +93,8 @@ export const llmJudgeFactory: EvaluatorFactoryFn = (config, context) => { } // Only pass judgeTargetProvider for agent providers (delegate mode). // LLM providers use the normal resolveJudgeProvider path for structured JSON mode. + // Note: agentv uses asLanguageModel() not invoke(), so it's not in AGENT_PROVIDER_KINDS; + // check it explicitly here for built-in agent mode. const isAgent = isAgentProvider(judgeTargetProvider) || judgeTargetProvider.kind === 'agentv'; evaluator = new LlmJudgeEvaluator({ resolveJudgeProvider: async (evalContext) => { diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 977b68daa..88ab00041 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -191,6 +191,7 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); +/** Backward compat: agent-judge YAML type is accepted and remapped to llm-judge at parse time. 
*/ const AgentJudgeSchema = EvaluatorCommonSchema.extend({ type: z.enum(['agent-judge', 'agent_judge']), prompt: z.string().optional(), diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts index 8670f4ec3..2b0c0aadd 100644 --- a/packages/core/test/evaluation/providers/agentv-provider.test.ts +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -68,7 +68,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect((model as any).modelId).toBe('gpt-5-mini'); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-5-mini'); }); it('asLanguageModel() works with anthropic model strings', () => { @@ -78,7 +78,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect((model as any).modelId).toBe('claude-sonnet-4-20250514'); + expect((model as unknown as { modelId: string }).modelId).toBe('claude-sonnet-4-20250514'); }); it('asLanguageModel() works with google model strings', () => { @@ -88,7 +88,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect((model as any).modelId).toBe('gemini-2.5-flash'); + expect((model as unknown as { modelId: string }).modelId).toBe('gemini-2.5-flash'); }); it('asLanguageModel() works with azure model strings', () => { @@ -98,7 +98,7 @@ describe('AgentvProvider', () => { }); const model = provider.asLanguageModel(); expect(model).toBeDefined(); - expect((model as any).modelId).toBe('gpt-4o-deployment'); + expect((model as unknown as { modelId: string }).modelId).toBe('gpt-4o-deployment'); }); it('throws for unsupported provider prefix', () => { From 0f3c6afa5d8a27f7ec3dc6a3a12d593efde2fbee Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 14:06:56 +0000 Subject: [PATCH 10/13] refactor: remove all 
agent-judge references from codebase --- .../content/docs/evaluation/eval-cases.mdx | 2 +- .../content/docs/guides/agent-eval-layers.mdx | 6 +- ...026-02-26-eval-schema-generation-design.md | 12 +- .../features/agent-judge/.agentv/targets.yaml | 22 - .../evals/dataset.eval.baseline.jsonl | 2 - .../agent-judge/evals/dataset.eval.yaml | 64 - .../workspace-template/package.json | 5 - .../workspace-template/src/main.ts | 11 - .../file-changes-judges/.agentv/targets.yaml | 4 +- .../evals/dataset.eval.yaml | 18 +- .../loaders/eval-yaml-transpiler.ts | 10 +- .../evaluation/loaders/evaluator-parser.ts | 79 +- .../evaluation/validation/eval-file.schema.ts | 11 - .../loaders/eval-yaml-transpiler.test.ts | 4 +- plugins/agentv-dev/agents/eval-analyzer.md | 4 +- .../skills/agentv-eval-writer/SKILL.md | 2 +- .../references/eval-schema.json | 2166 +---------------- 17 files changed, 141 insertions(+), 2281 deletions(-) delete mode 100644 examples/features/agent-judge/.agentv/targets.yaml delete mode 100644 examples/features/agent-judge/evals/dataset.eval.baseline.jsonl delete mode 100644 examples/features/agent-judge/evals/dataset.eval.yaml delete mode 100644 examples/features/agent-judge/workspace-template/package.json delete mode 100644 examples/features/agent-judge/workspace-template/src/main.ts diff --git a/apps/web/src/content/docs/evaluation/eval-cases.mdx b/apps/web/src/content/docs/evaluation/eval-cases.mdx index cc1545b64..4881674a9 100644 --- a/apps/web/src/content/docs/evaluation/eval-cases.mdx +++ b/apps/web/src/content/docs/evaluation/eval-cases.mdx @@ -265,7 +265,7 @@ tests: ### `assert` present — explicit evaluators only -When `assert` is defined, only the declared evaluators run. No implicit judge is added. Judges that are declared (such as `llm-judge`, `code-judge`, `agent-judge`, or `rubrics`) receive `criteria` as input automatically. +When `assert` is defined, only the declared evaluators run. No implicit judge is added. 
Judges that are declared (such as `llm-judge`, `code-judge`, or `rubrics`) receive `criteria` as input automatically. If `assert` contains only deterministic evaluators (like `contains` or `regex`), the `criteria` field is not evaluated and a warning is emitted: diff --git a/apps/web/src/content/docs/guides/agent-eval-layers.mdx b/apps/web/src/content/docs/guides/agent-eval-layers.mdx index 783a2ca55..6d0f542cf 100644 --- a/apps/web/src/content/docs/guides/agent-eval-layers.mdx +++ b/apps/web/src/content/docs/guides/agent-eval-layers.mdx @@ -15,8 +15,8 @@ Covers plan quality, plan adherence, and tool selection rationale. Use LLM-based | Concern | AgentV evaluator | |---------|-----------------| -| Plan quality & coherence | `llm_judge` with reasoning-focused prompt | -| Workspace-aware auditing | `agent_judge` with rubrics | +| Plan quality & coherence | `llm-judge` with reasoning-focused prompt | +| Workspace-aware auditing | `llm-judge` with rubrics | ```yaml # Layer 1: Reasoning — verify the agent's plan makes sense @@ -29,7 +29,7 @@ assertions: Did it select appropriate tools for the task? Score 1.0 if reasoning is sound, 0.0 if not. 
- name: workspace-audit - type: agent-judge + type: llm-judge max_steps: 5 temperature: 0 rubrics: diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md index a20a7909f..9d6047886 100644 --- a/docs/plans/2026-02-26-eval-schema-generation-design.md +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -248,14 +248,9 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.literal('agent_judge'), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); +// Note: agent_judge was removed — llm-judge now covers all judge use cases +// including agentic behavior (auto-detected based on judge provider kind). +// See LlmJudgeSchema above for the unified schema. const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), @@ -292,7 +287,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/examples/features/agent-judge/.agentv/targets.yaml b/examples/features/agent-judge/.agentv/targets.yaml deleted file mode 100644 index 6d5c82918..000000000 --- a/examples/features/agent-judge/.agentv/targets.yaml +++ /dev/null @@ -1,22 +0,0 @@ -targets: - # Mock agent that "creates tests" in the workspace. - # Each test gets a fresh copy of workspace-template/ as its CWD. 
- - name: mock_agent - provider: cli - command: >- - bash -c ' - mkdir -p tests && - printf "import { add, multiply } from \"../src/main\";\n\ndescribe(\"math functions\", () => {\n test(\"add returns sum\", () => {\n expect(add(2, 3)).toBe(5);\n });\n\n test(\"multiply returns product\", () => {\n expect(multiply(4, 5)).toBe(20);\n });\n});\n" > tests/math.test.ts && - printf "import { greet } from \"../src/main\";\n\ndescribe(\"greet\", () => {\n test(\"returns greeting\", () => {\n expect(greet(\"World\")).toBe(\"Hello, World!\");\n });\n});\n" > tests/greet.test.ts && - echo "Created test files: tests/math.test.ts and tests/greet.test.ts" > {OUTPUT_FILE} - ' - workspace_template: ../workspace-template - judge_target: azure_judge - - # Azure OpenAI target used as judge provider for built-in agent_judge mode. - - name: azure_judge - provider: azure - endpoint: ${{ AZURE_OPENAI_ENDPOINT }} - api_key: ${{ AZURE_OPENAI_API_KEY }} - model: ${{ AZURE_DEPLOYMENT_NAME }} - version: ${{ AZURE_OPENAI_API_VERSION }} diff --git a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl b/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl deleted file mode 100644 index bc2d5b6ee..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.baseline.jsonl +++ /dev/null @@ -1,2 +0,0 @@ -{"timestamp":"2026-02-20T21:37:58.641Z","test_id":"verify-test-creation-freeform","dataset":"dataset","score":1,"hits":["Created tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"target":"mock_agent","reasoning":"workspace-audit: All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","scores":[{"name":"workspace-audit","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["Created 
tests/math.test.ts and tests/greet.test.ts in tests/ directory","Test files import functions from src/main.ts","add, multiply, and greet functions are tested","Assertions are meaningful and verify correct outputs"],"misses":[],"reasoning":"All criteria are fully met: each function is tested with meaningful assertions, test files are correctly placed and import from the source file.","details":{"mode":"built-in","steps":3,"tool_calls":5}}]} -{"timestamp":"2026-02-20T21:37:59.540Z","test_id":"verify-test-creation-rubric","dataset":"dataset","score":1,"hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"target":"mock_agent","reasoning":"workspace-audit-rubric: All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. 
The candidate answer meets all rubric criteria.","scores":[{"name":"workspace-audit-rubric","type":"agent-judge","score":1,"weight":1,"verdict":"pass","hits":["[tests-dir-exists] A tests/ directory exists in the workspace: A 'tests/' directory exists in the workspace, containing test files.","[math-tests] Test file exists that tests the add and multiply functions: 'tests/math.test.ts' exists and contains tests for both 'add' and 'multiply' functions.","[greet-tests] Test file exists that tests the greet function: 'tests/greet.test.ts' exists and contains a test for the 'greet' function.","[assertions-present] Tests contain proper assertions (expect/assert calls): All test files contain proper assertions using 'expect' calls."],"misses":[],"reasoning":"All required test files exist in the 'tests/' directory, and each function from 'src/main.ts' is covered by appropriate unit tests with proper assertions. The candidate answer meets all rubric criteria.","details":{"mode":"built-in","steps":2,"tool_calls":3}}]} diff --git a/examples/features/agent-judge/evals/dataset.eval.yaml b/examples/features/agent-judge/evals/dataset.eval.yaml deleted file mode 100644 index a9bf21048..000000000 --- a/examples/features/agent-judge/evals/dataset.eval.yaml +++ /dev/null @@ -1,64 +0,0 @@ -# Agent Judge feature demonstration -# Tests that the agent_judge evaluator can investigate the workspace -# to verify that an agent created the expected files and content. -# -# The mock_agent creates test files in the workspace-template. -# The agent_judge evaluator uses an AI SDK agent loop with filesystem tools -# to verify the test files exist and contain proper test cases. - -description: Verify agent_judge evaluator can audit workspace file creation - -execution: - target: mock_agent - -tests: - # Case 1: freeform agent_judge (no rubrics) — scores 0-1 - - id: verify-test-creation-freeform - criteria: >- - The agent should create unit test files in a tests/ directory. 
- Test files should import from src/main.ts and test the add, multiply, - and greet functions with meaningful assertions. - - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit - type: agent-judge - max_steps: 5 - temperature: 0 - - # Case 2: rubric-based agent_judge — structured evaluation - - id: verify-test-creation-rubric - criteria: >- - The agent should create comprehensive unit tests for the project. - - input: - - role: user - content: - - type: text - value: Create unit tests for all functions in src/main.ts - - assertions: - - name: workspace-audit-rubric - type: agent-judge - max_steps: 5 - temperature: 0 - rubrics: - - id: tests-dir-exists - outcome: "A tests/ directory exists in the workspace" - weight: 1.0 - required: true - - id: math-tests - outcome: "Test file exists that tests the add and multiply functions" - weight: 1.0 - required: true - - id: greet-tests - outcome: "Test file exists that tests the greet function" - weight: 1.0 - - id: assertions-present - outcome: "Tests contain proper assertions (expect/assert calls)" - weight: 0.5 diff --git a/examples/features/agent-judge/workspace-template/package.json b/examples/features/agent-judge/workspace-template/package.json deleted file mode 100644 index 24d635536..000000000 --- a/examples/features/agent-judge/workspace-template/package.json +++ /dev/null @@ -1,5 +0,0 @@ -{ - "name": "sample-project", - "version": "1.0.0", - "type": "module" -} diff --git a/examples/features/agent-judge/workspace-template/src/main.ts b/examples/features/agent-judge/workspace-template/src/main.ts deleted file mode 100644 index cfda22527..000000000 --- a/examples/features/agent-judge/workspace-template/src/main.ts +++ /dev/null @@ -1,11 +0,0 @@ -export function add(a: number, b: number): number { - return a + b; -} - -export function multiply(a: number, b: number): number { - return a * b; -} - -export function 
greet(name: string): string { - return `Hello, ${name}!`; -} diff --git a/examples/features/file-changes-judges/.agentv/targets.yaml b/examples/features/file-changes-judges/.agentv/targets.yaml index d9645bc03..10c067b31 100644 --- a/examples/features/file-changes-judges/.agentv/targets.yaml +++ b/examples/features/file-changes-judges/.agentv/targets.yaml @@ -11,7 +11,7 @@ targets: workspace_template: ../workspace-template judge_target: azure_judge - # Azure OpenAI — used as LLM judge (rubrics) and built-in agent_judge provider + # Azure OpenAI — used as LLM judge (rubrics) and built-in llm-judge provider - name: azure_judge provider: azure endpoint: ${{ AZURE_OPENAI_ENDPOINT }} @@ -19,7 +19,7 @@ targets: model: ${{ AZURE_DEPLOYMENT_NAME }} version: ${{ AZURE_OPENAI_API_VERSION }} - # Copilot CLI — used as delegated agent_judge target + # Copilot CLI — used as delegated llm-judge target - name: copilot_judge provider: copilot-cli model: claude-haiku-4.5 diff --git a/examples/features/file-changes-judges/evals/dataset.eval.yaml b/examples/features/file-changes-judges/evals/dataset.eval.yaml index 65ebd68df..2fb796537 100644 --- a/examples/features/file-changes-judges/evals/dataset.eval.yaml +++ b/examples/features/file-changes-judges/evals/dataset.eval.yaml @@ -2,13 +2,13 @@ # # Proves that file_changes diffs are correctly passed to all judge types: # 1. rubrics — LLM judge (Azure) evaluates the diff -# 2. agent_judge — built-in mode (Azure via AI SDK) sees file_changes in prompt -# 3. agent_judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt +# 2. llm-judge — built-in mode (Azure via AI SDK) sees file_changes in prompt +# 3. llm-judge — delegated mode (Copilot CLI with haiku) sees file_changes in prompt # # The mock agent adds a `subtract` function to calculator.ts, producing a small # diff (~10 lines) that fits comfortably in any LLM context window. 
-description: Verify file_changes diffs are accessible to LLM judge, built-in agent judge, and copilot-cli agent judge +description: Verify file_changes diffs are accessible to LLM judge (rubrics, built-in, and copilot-cli) execution: target: mock_agent @@ -43,14 +43,14 @@ tests: outcome: "The file_changes contains a valid unified diff format" weight: 0.5 - # 2. Built-in agent judge — Azure via AI SDK with filesystem tools - - name: agent-judge-builtin - type: agent-judge + # 2. Built-in LLM judge — Azure via AI SDK with filesystem tools + - name: llm-judge-builtin + type: llm-judge max_steps: 3 temperature: 0 - # 3. Copilot CLI agent judge — delegated via target - - name: agent-judge-copilot - type: agent-judge + # 3. Copilot CLI LLM judge — delegated via target + - name: llm-judge-copilot + type: llm-judge target: copilot_judge temperature: 0 diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index 7e99fd01f..db646013b 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -143,9 +143,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { return `Output ends with '${entry.value}'`; case 'llm-judge': - case 'llm_judge': - case 'agent-judge': - case 'agent_judge': { + case 'llm_judge': { // Expand each rubric item to its own assertion string // Return the first one — callers handle arrays via assertionToNaturalLanguageList if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { @@ -215,14 +213,12 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { /** * Expand a single assertion entry into zero or more NL strings. - * Most assertions produce exactly one string; llm-judge/agent-judge with rubrics expands to many. + * Most assertions produce exactly one string; llm-judge with rubrics expands to many. 
*/ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { if ( entry.type === 'llm-judge' || - entry.type === 'llm_judge' || - entry.type === 'agent-judge' || - entry.type === 'agent_judge' + entry.type === 'llm_judge' ) { if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 2b77e87e3..8f994a8e4 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -134,9 +134,8 @@ async function parseEvaluatorList( const typeValue = typeof rawType === 'string' ? normalizeEvaluatorType(rawType) : rawType; // Unknown types are treated as custom assertion types (resolved via registry discovery) - // 'agent-judge' is a known alias (maps to 'llm-judge'), not a custom type const isCustomType = - typeof typeValue === 'string' && !isEvaluatorKind(typeValue) && typeValue !== 'agent-judge'; + typeof typeValue === 'string' && !isEvaluatorKind(typeValue); if (typeof typeValue !== 'string') { logWarning(`Skipping evaluator with invalid type in '${evalId}'`); continue; @@ -854,82 +853,6 @@ async function parseEvaluatorList( continue; } - // Backward compat: agent-judge / agent_judge → llm-judge with agent-specific fields - if ((typeValue as string) === 'agent-judge') { - // Validate max_steps (1-50) - const rawMaxSteps = rawEvaluator.max_steps ?? 
rawEvaluator.maxSteps; - let maxSteps: number | undefined; - if (rawMaxSteps !== undefined) { - if ( - typeof rawMaxSteps !== 'number' || - !Number.isInteger(rawMaxSteps) || - rawMaxSteps < 1 || - rawMaxSteps > 50 - ) { - logWarning( - `Skipping llm-judge evaluator '${name}' in '${evalId}': max_steps must be an integer 1-50`, - ); - continue; - } - maxSteps = rawMaxSteps; - } - - // Validate temperature (0-2) - const rawTemperature = rawEvaluator.temperature; - let temperature: number | undefined; - if (rawTemperature !== undefined) { - if (typeof rawTemperature !== 'number' || rawTemperature < 0 || rawTemperature > 2) { - logWarning( - `Skipping llm-judge evaluator '${name}' in '${evalId}': temperature must be a number 0-2`, - ); - continue; - } - temperature = rawTemperature; - } - - // Validate target (string) - const judgeTarget = asString(rawEvaluator.target); - - // Parse prompt (file path or inline text) - let agentPrompt: string | undefined; - let agentPromptPath: string | undefined; - const rawAgentPrompt = rawEvaluator.prompt; - if (typeof rawAgentPrompt === 'string') { - agentPrompt = rawAgentPrompt; - const resolved = await resolveFileReference(rawAgentPrompt, searchRoots); - if (resolved.resolvedPath) { - agentPromptPath = path.resolve(resolved.resolvedPath); - } - } - - // Parse rubrics via existing infrastructure - const rawAgentRubrics = rawEvaluator.rubrics; - const agentParsedRubrics = Array.isArray(rawAgentRubrics) - ? parseRubricItems(rawAgentRubrics, name, evalId) - : undefined; - - const weight = validateWeight(rawEvaluator.weight, name, evalId); - const required = parseRequired(rawEvaluator.required); - - evaluators.push({ - name, - type: 'llm-judge', - ...(agentPrompt ? { prompt: agentPrompt } : {}), - ...(agentPromptPath - ? { promptPath: agentPromptPath, resolvedPromptPath: agentPromptPath } - : {}), - ...(agentParsedRubrics && agentParsedRubrics.length > 0 - ? { rubrics: agentParsedRubrics } - : {}), - ...(maxSteps !== undefined ? 
{ max_steps: maxSteps } : {}), - ...(temperature !== undefined ? { temperature } : {}), - ...(judgeTarget ? { target: judgeTarget } : {}), - ...(weight !== undefined ? { weight } : {}), - ...(required !== undefined ? { required } : {}), - ...(negate !== undefined ? { negate } : {}), - }); - continue; - } if (typeValue === 'skill-trigger') { const skillName = asString(rawEvaluator.skill); diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 88ab00041..e3bad5fed 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -191,16 +191,6 @@ const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ exploration_tolerance: z.number().min(0).optional(), }); -/** Backward compat: agent-judge YAML type is accepted and remapped to llm-judge at parse time. */ -const AgentJudgeSchema = EvaluatorCommonSchema.extend({ - type: z.enum(['agent-judge', 'agent_judge']), - prompt: z.string().optional(), - rubrics: z.array(RubricItemSchema).optional(), - max_steps: z.number().int().min(1).max(50).optional(), - temperature: z.number().min(0).max(2).optional(), - target: z.string().optional(), -}); - const ContainsSchema = EvaluatorCommonSchema.extend({ type: z.literal('contains'), value: z.string(), @@ -236,7 +226,6 @@ const EvaluatorSchema = z.union([ CostSchema, TokenUsageSchema, ExecutionMetricsSchema, - AgentJudgeSchema, ContainsSchema, RegexSchema, IsJsonSchema, diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts index fa8a7e497..0647ce387 100644 --- a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -244,7 +244,7 @@ describe('transpileEvalYaml — NL assertions', () => { expect(evals[0].assertions).toContain('The answer is 
clear and concise'); }); - it('converts agent-judge with rubrics to multiple assertions', () => { + it('converts llm-judge with rubrics to multiple assertions (rubrics variant)', () => { const suite = { tests: [ { @@ -253,7 +253,7 @@ describe('transpileEvalYaml — NL assertions', () => { assertions: [ { type: 'skill-trigger', skill: 's', should_trigger: true }, { - type: 'agent-judge', + type: 'llm-judge', rubrics: [ { id: 'r1', outcome: 'Correct result returned' }, { id: 'r2', outcome: 'No unnecessary steps' }, diff --git a/plugins/agentv-dev/agents/eval-analyzer.md b/plugins/agentv-dev/agents/eval-analyzer.md index 547c86267..31660128e 100644 --- a/plugins/agentv-dev/agents/eval-analyzer.md +++ b/plugins/agentv-dev/agents/eval-analyzer.md @@ -28,7 +28,7 @@ If `eval-path` is provided, also read the EVAL.yaml to understand evaluator conf ### Step 2: Deterministic-Upgrade Analysis -For each evaluator entry in `scores` where `type` is `"llm-judge"`, `"rubrics"`, or `"agent-judge"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: +For each evaluator entry in `scores` where `type` is `"llm-judge"` or `"rubrics"`, inspect the `reasoning`, `hits`, and `misses` fields for patterns that indicate a deterministic assertion would suffice: | Signal | Detection | Suggested Upgrade | |--------|-----------|-------------------| @@ -123,7 +123,7 @@ If a section has no findings, include the header with "None found." underneath. - **Be specific:** Every suggestion must include the test case ID, evaluator name, evidence from the results, and a concrete replacement config. - **Be conservative:** Only suggest deterministic upgrades when the pattern is clear and consistent. Partial or ambiguous evidence should be noted but not acted on. - **Prioritize by impact:** Order suggestions by estimated cost savings (LLM-judge → deterministic saves the most). 
-- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `agent-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. +- **Handle all evaluator types:** Process `code-judge`, `tool-trajectory`, `llm-judge`, `rubrics`, `composite`, and all deterministic types. Only LLM-based types are candidates for deterministic upgrades. - **Multi-provider awareness:** When results span multiple targets, note if a suggestion applies to all targets or is target-specific. - **No false positives:** It is better to miss a suggestion than to recommend an incorrect upgrade. If unsure, add the finding to a "Needs Review" subsection with your reasoning. diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md index 95d7bf796..5ae6275a3 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-writer/SKILL.md @@ -222,7 +222,7 @@ tests: |----------|-------------|----------| | `criteria` + **no `assertions`** | Implicit `llm-judge` runs automatically against `criteria` | No | | `criteria` + **`assertions` with only deterministic evaluators** (contains, regex, etc.) | Only declared evaluators run. `criteria` is **not evaluated**. | Yes — warns that no evaluator will consume criteria | -| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, agent-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. | No | +| `criteria` + **`assertions` with a judge** (llm-judge, code-judge, rubrics) | Declared evaluators run. Judges receive `criteria` as input. 
| No | ### No assertions → implicit llm-judge diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index b55528f3c..77f5be6d9 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -1075,120 +1075,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", 
"properties": { @@ -2280,120 +2166,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -3512,136 +3284,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - 
"required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -4702,120 +4360,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - 
"type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -5934,136 +5478,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - 
"minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -7112,120 +6542,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - 
"type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -8830,136 +8146,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - 
"target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -10008,120 +9210,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": 
"string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -11240,136 +10328,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": 
{ - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -12430,120 +11404,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - 
], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -13662,136 +12522,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + 
"type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -14840,120 +13586,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -16434,136 +15066,22 @@ }, "type": { "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] + "const": "contains" }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - 
"type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "const": "contains" - }, - "value": { - "type": "string" - } - }, - "required": [ - "type", - "value" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { + "value": { + "type": "string" + } + }, + "required": [ + "type", + "value" + ], + "additionalProperties": false + }, + { + "type": "object", + "properties": { + "name": { "type": "string" }, "weight": { @@ -17612,120 +16130,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - 
"maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -18782,148 +17186,34 @@ "execution_metrics" ] }, - "max_tool_calls": { - "type": "number", - "minimum": 0 - }, - "max_llm_calls": { - "type": "number", - "minimum": 0 - }, - "max_tokens": { - "type": "number", - "minimum": 0 - }, - "max_cost_usd": { - "type": "number", - "minimum": 0 - }, - "max_duration_ms": { - "type": "number", - "minimum": 0 - }, - "target_exploration_ratio": { - "type": "number", - "minimum": 0, - "maximum": 1 - }, - "exploration_tolerance": { - "type": "number", - "minimum": 0 - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": 
"number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 + "max_tool_calls": { + "type": "number", + "minimum": 0 }, - "temperature": { + "max_llm_calls": { + "type": "number", + "minimum": 0 + }, + "max_tokens": { + "type": "number", + "minimum": 0 + }, + "max_cost_usd": { + "type": "number", + "minimum": 0 + }, + "max_duration_ms": { + "type": "number", + "minimum": 0 + }, + "target_exploration_ratio": { "type": "number", "minimum": 0, - "maximum": 2 + "maximum": 1 }, - "target": { - "type": "string" + "exploration_tolerance": { + "type": "number", + "minimum": 0 } }, "required": [ @@ -20074,120 +18364,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": 
"boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": { - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { @@ -21279,120 +19455,6 @@ ], "additionalProperties": false }, - { - "type": "object", - "properties": { - "name": { - "type": "string" - }, - "weight": { - "type": "number", - "minimum": 0 - }, - "required": { - "anyOf": [ - { - "type": "boolean" - }, - { - "type": "number", - "exclusiveMinimum": 0, - "maximum": 1 - } - ] - }, - "negate": { - "type": "boolean" - }, - "type": { - "type": "string", - "enum": [ - "agent-judge", - "agent_judge" - ] - }, - "prompt": { - "type": "string" - }, - "rubrics": { - "type": "array", - "items": { - "type": "object", - "properties": { - "id": { - "type": "string" - }, - "outcome": 
{ - "type": "string" - }, - "weight": { - "type": "number" - }, - "required": { - "type": "boolean" - }, - "required_min_score": { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - "score_ranges": { - "type": "array", - "items": { - "type": "object", - "properties": { - "score_range": { - "type": "array", - "minItems": 2, - "maxItems": 2, - "items": [ - { - "type": "integer", - "minimum": 0, - "maximum": 10 - }, - { - "type": "integer", - "minimum": 0, - "maximum": 10 - } - ] - }, - "outcome": { - "type": "string", - "minLength": 1 - } - }, - "required": [ - "score_range", - "outcome" - ], - "additionalProperties": false - } - } - }, - "additionalProperties": false - } - }, - "max_steps": { - "type": "integer", - "minimum": 1, - "maximum": 50 - }, - "temperature": { - "type": "number", - "minimum": 0, - "maximum": 2 - }, - "target": { - "type": "string" - } - }, - "required": [ - "type" - ], - "additionalProperties": false - }, { "type": "object", "properties": { From 3171eb94b26b145418b498c06cfc1a577a2108f7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 19:15:51 +0000 Subject: [PATCH 11/13] docs: add E2E checklist to CLAUDE.md for all work before finishing Co-Authored-By: Claude Opus 4.6 --- CLAUDE.md | 23 +++++++++++++++++++++++ 1 file changed, 23 insertions(+) diff --git a/CLAUDE.md b/CLAUDE.md index ea7608f0f..492e4c944 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -159,6 +159,28 @@ Unit tests alone are insufficient for evaluator changes. After implementing or m 5. **Note:** `--dry-run` returns mock responses that don't match evaluator output schemas. Use it only for testing harness flow, not evaluator logic. +## Completing Work — E2E Checklist + +Before marking any branch as ready for review, complete this checklist: + +1. **Copy `.env` to worktree** (if working in a git worktree): + ```bash + cp /home/christso/projects/agentv/.env .env + ``` + Without this, any eval run or LLM-dependent test will fail with missing API key errors. 
+ +2. **Run unit tests**: `bun run test` — all must pass. + +3. **Run at least one real eval** against an example file to verify end-to-end behavior: + ```bash + bun apps/cli/src/cli.ts eval examples/features/rubric/evals/dataset.eval.yaml --test-id <test-id> + ``` + Inspect the output JSONL to confirm correct evaluator type, scores, and hits/misses. + +4. **Verify no regressions** in areas adjacent to your changes (e.g., if you changed evaluator parsing, run an eval that exercises different evaluator types). + +5. **Mark PR as ready** only after all above steps pass. + ## Evaluator Type System Evaluator types use **kebab-case** everywhere (matching promptfoo convention): @@ -248,6 +270,7 @@ When working on a GitHub issue, **ALWAYS** follow this workflow: ``` 4. **Before merging**, ensure: + - **E2E verification completed** (see "Completing Work — E2E Checklist" above) - CI pipeline passes (all checks green) - Code has been reviewed if required - No merge conflicts with `main` From effb331dbdef6ba3cd0e6667c467c60167a440c1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 19:16:58 +0000 Subject: [PATCH 12/13] style: fix biome formatting Co-Authored-By: Claude Opus 4.6 --- apps/cli/node_modules | 1 + node_modules | 1 + packages/core/node_modules | 1 + .../loaders/eval-yaml-transpiler.ts | 5 +- .../evaluation/loaders/evaluator-parser.ts | 4 +- packages/core/src/evaluation/orchestrator.ts | 1 - packages/eval/node_modules | 1 + .../references/eval-schema.json | 3462 ++++------------- 8 files changed, 677 insertions(+), 2799 deletions(-) create mode 120000 apps/cli/node_modules create mode 120000 node_modules create mode 120000 packages/core/node_modules create mode 120000 packages/eval/node_modules diff --git a/apps/cli/node_modules b/apps/cli/node_modules new file mode 120000 index 000000000..c99229581 --- /dev/null +++ b/apps/cli/node_modules @@ -0,0 +1 @@ +/home/christso/projects/agentv/apps/cli/node_modules \ No newline at end of file diff --git
a/node_modules b/node_modules new file mode 120000 index 000000000..8cba0ae08 --- /dev/null +++ b/node_modules @@ -0,0 +1 @@ +/home/christso/projects/agentv/node_modules \ No newline at end of file diff --git a/packages/core/node_modules b/packages/core/node_modules new file mode 120000 index 000000000..a07840188 --- /dev/null +++ b/packages/core/node_modules @@ -0,0 +1 @@ +/home/christso/projects/agentv/packages/core/node_modules \ No newline at end of file diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index db646013b..9c79366a0 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -216,10 +216,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { * Most assertions produce exactly one string; llm-judge with rubrics expands to many. */ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { - if ( - entry.type === 'llm-judge' || - entry.type === 'llm_judge' - ) { + if (entry.type === 'llm-judge' || entry.type === 'llm_judge') { if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) .map((r) => r.outcome ?? r.criteria ?? r.id) diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 8f994a8e4..4ec619e22 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -134,8 +134,7 @@ async function parseEvaluatorList( const typeValue = typeof rawType === 'string' ? 
normalizeEvaluatorType(rawType) : rawType; // Unknown types are treated as custom assertion types (resolved via registry discovery) - const isCustomType = - typeof typeValue === 'string' && !isEvaluatorKind(typeValue); + const isCustomType = typeof typeValue === 'string' && !isEvaluatorKind(typeValue); if (typeof typeValue !== 'string') { logWarning(`Skipping evaluator with invalid type in '${evalId}'`); continue; @@ -853,7 +852,6 @@ async function parseEvaluatorList( continue; } - if (typeValue === 'skill-trigger') { const skillName = asString(rawEvaluator.skill); if (!skillName) { diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index fb8649aa4..95cbdab7f 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -346,7 +346,6 @@ export async function runEvaluation( if (cliJudgeTarget === 'agentv') { if (!cliModel) { throw new Error('--judge-target "agentv" requires --model (e.g., "openai:gpt-5-mini")'); - } const { AgentvProvider } = await import('./providers/agentv-provider.js'); return new AgentvProvider('agentv', { model: cliModel, temperature: 0 }); diff --git a/packages/eval/node_modules b/packages/eval/node_modules new file mode 120000 index 000000000..d1cf07368 --- /dev/null +++ b/packages/eval/node_modules @@ -0,0 +1 @@ +/home/christso/projects/agentv/packages/eval/node_modules \ No newline at end of file diff --git a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json index 77f5be6d9..483031bf6 100644 --- a/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-writer/references/eval-schema.json @@ -53,12 +53,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": 
[ @@ -72,29 +67,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -129,12 +115,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -148,29 +129,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -192,12 +164,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -211,29 +178,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -270,10 +228,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -325,10 +280,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -358,10 +310,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - 
"llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -455,10 +404,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -487,9 +433,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -549,9 +493,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -567,10 +509,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -587,10 +526,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -607,18 +543,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -648,20 +579,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -702,12 +624,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -721,12 +638,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -737,9 +649,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -747,12 +657,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - 
"ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -766,12 +671,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -782,10 +682,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -815,10 +712,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -830,11 +724,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -856,26 +746,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -912,10 +793,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -952,10 +830,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -985,10 +860,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -1003,9 +875,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1035,10 +905,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", 
"execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -1070,9 +937,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1108,10 +973,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1147,10 +1009,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1180,15 +1039,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1224,10 +1078,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -1308,10 +1159,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1321,10 +1169,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -1361,10 +1206,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -1416,10 +1258,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -1449,10 +1288,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -1546,10 +1382,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -1578,9 +1411,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ 
-1640,9 +1471,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -1658,10 +1487,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -1678,10 +1504,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -1698,18 +1521,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -1739,20 +1557,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -1793,12 +1602,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1812,12 +1616,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1828,9 +1627,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -1838,12 +1635,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1857,12 +1649,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -1873,10 +1660,7 @@ ] } }, - "required": [ - "type", 
- "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -1906,10 +1690,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -1921,11 +1702,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -1947,26 +1724,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -2003,10 +1771,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2043,10 +1808,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -2076,10 +1838,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -2094,9 +1853,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2126,10 +1883,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -2161,9 +1915,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2199,10 +1951,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ 
-2238,10 +1987,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2271,15 +2017,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2315,10 +2056,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -2399,10 +2137,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2412,10 +2147,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -2452,10 +2184,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -2507,10 +2236,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -2540,10 +2266,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -2637,10 +2360,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -2669,9 +2389,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2731,9 +2449,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -2749,10 +2465,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -2769,10 +2482,7 @@ "type": "string" } }, - "required": [ - "type", - 
"path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -2789,18 +2499,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -2830,20 +2535,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -2884,12 +2580,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2903,12 +2594,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2919,9 +2605,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -2929,12 +2613,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2948,12 +2627,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -2964,10 +2638,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -2997,10 +2668,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -3012,11 +2680,7 @@ }, "match": { "type": "string", - "enum": [ - 
"exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -3038,26 +2702,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -3094,10 +2749,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3134,10 +2786,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -3167,10 +2816,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -3185,9 +2831,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3217,10 +2861,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -3252,9 +2893,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3290,10 +2929,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3329,10 +2965,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3362,15 +2995,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], 
"additionalProperties": false }, { @@ -3406,10 +3034,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -3490,10 +3115,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3503,10 +3125,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -3555,10 +3174,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -3610,10 +3226,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -3643,10 +3256,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -3740,10 +3350,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -3772,9 +3379,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3834,9 +3439,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -3852,10 +3455,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -3872,10 +3472,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -3892,18 +3489,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ 
-3933,20 +3525,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -3987,12 +3570,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4006,12 +3584,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4022,9 +3595,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -4032,12 +3603,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4051,12 +3617,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -4067,10 +3628,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -4100,10 +3658,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -4115,11 +3670,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -4141,26 +3692,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - 
"weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -4197,10 +3739,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4237,10 +3776,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -4270,10 +3806,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -4288,9 +3821,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4320,10 +3851,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -4355,9 +3883,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4393,10 +3919,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4432,10 +3955,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4465,15 +3985,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4509,10 +4024,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -4593,10 +4105,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], 
"additionalProperties": false } } @@ -4606,10 +4115,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -4646,10 +4152,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -4701,10 +4204,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -4734,10 +4234,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -4831,10 +4328,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -4863,9 +4357,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4925,9 +4417,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -4943,10 +4433,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -4963,10 +4450,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -4983,18 +4467,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -5024,20 +4503,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", 
"superset"] }, "minimums": { "type": "object", @@ -5078,12 +4548,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5097,12 +4562,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5113,9 +4573,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -5123,12 +4581,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5142,12 +4595,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -5158,10 +4606,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -5191,10 +4636,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -5206,11 +4648,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -5232,26 +4670,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -5288,10 +4717,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], 
"additionalProperties": false }, { @@ -5328,10 +4754,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -5361,10 +4784,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -5379,9 +4799,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5411,10 +4829,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -5446,9 +4861,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5484,10 +4897,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5523,10 +4933,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5556,15 +4963,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -5600,10 +5002,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -5684,10 +5083,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5697,10 +5093,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -5737,10 +5130,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { 
"anyOf": [ @@ -5792,10 +5182,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -5825,10 +5212,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -5922,10 +5306,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -5954,9 +5335,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6016,9 +5395,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6034,10 +5411,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6054,10 +5428,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -6074,18 +5445,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -6115,20 +5481,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -6169,12 +5526,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6188,12 +5540,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - 
"superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6204,9 +5551,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -6214,12 +5559,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6233,12 +5573,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -6249,10 +5584,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -6282,10 +5614,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -6297,11 +5626,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -6323,26 +5648,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -6379,10 +5695,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -6419,10 +5732,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -6452,10 +5762,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { 
"type": "number", @@ -6470,9 +5777,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6502,10 +5807,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -6537,9 +5839,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6575,10 +5875,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6614,10 +5911,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6647,15 +5941,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -6691,10 +5980,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -6775,10 +6061,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -6788,10 +6071,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -6812,11 +6092,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -6827,9 +6103,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -6857,10 +6131,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] 
}, "repos": { "type": "array", @@ -6884,10 +6155,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -6901,10 +6169,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -6917,10 +6182,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -6949,10 +6211,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -6988,11 +6247,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7023,11 +6278,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7058,11 +6309,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7093,11 +6340,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -7107,11 +6350,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -7133,9 +6372,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -7173,12 +6410,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7192,29 +6424,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, 
"value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -7236,12 +6459,7 @@ "properties": { "role": { "type": "string", - "enum": [ - "system", - "user", - "assistant", - "tool" - ] + "enum": ["system", "user", "assistant", "tool"] }, "content": { "anyOf": [ @@ -7255,29 +6473,20 @@ "properties": { "type": { "type": "string", - "enum": [ - "text", - "file" - ] + "enum": ["text", "file"] }, "value": { "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false } } ] } }, - "required": [ - "role", - "content" - ], + "required": ["role", "content"], "additionalProperties": false } } @@ -7314,10 +6523,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -7369,10 +6575,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -7402,10 +6605,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -7499,10 +6699,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -7531,9 +6728,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7593,9 +6788,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -7611,10 +6804,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -7631,10 +6821,7 @@ "type": "string" } }, - "required": [ - 
"type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -7651,18 +6838,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -7692,20 +6874,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -7746,12 +6919,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7765,12 +6933,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7781,9 +6944,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -7791,12 +6952,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7810,12 +6966,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -7826,10 +6977,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -7859,10 +7007,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -7874,11 +7019,7 @@ }, "match": { "type": "string", - "enum": [ 
- "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -7900,26 +7041,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -7956,10 +7088,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -7996,10 +7125,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -8029,10 +7155,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -8047,9 +7170,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8079,10 +7200,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -8114,9 +7232,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8152,10 +7268,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8191,10 +7304,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8224,15 +7334,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], 
"additionalProperties": false }, { @@ -8268,10 +7373,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -8352,10 +7454,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8365,10 +7464,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -8405,10 +7501,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -8460,10 +7553,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -8493,10 +7583,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -8590,10 +7677,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -8622,9 +7706,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8684,9 +7766,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -8702,10 +7782,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -8722,10 +7799,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -8742,18 +7816,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ 
-8783,20 +7852,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -8837,12 +7897,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8856,12 +7911,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8872,9 +7922,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -8882,12 +7930,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8901,12 +7944,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -8917,10 +7955,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -8950,10 +7985,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -8965,11 +7997,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -8991,26 +8019,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - 
"weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -9047,10 +8066,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9087,10 +8103,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -9120,10 +8133,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -9138,9 +8148,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9170,10 +8178,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -9205,9 +8210,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9243,10 +8246,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9282,10 +8282,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9315,15 +8312,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9359,10 +8351,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -9443,10 +8432,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], 
"additionalProperties": false } } @@ -9456,10 +8442,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -9496,10 +8479,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -9551,10 +8531,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -9584,10 +8561,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -9681,10 +8655,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -9713,9 +8684,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9775,9 +8744,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -9793,10 +8760,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -9813,10 +8777,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -9833,18 +8794,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -9874,20 +8830,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", 
"superset"] }, "minimums": { "type": "object", @@ -9928,12 +8875,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9947,12 +8889,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9963,9 +8900,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -9973,12 +8908,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -9992,12 +8922,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -10008,10 +8933,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -10041,10 +8963,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -10056,11 +8975,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -10082,26 +8997,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -10138,10 +9044,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], 
"additionalProperties": false }, { @@ -10178,10 +9081,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -10211,10 +9111,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -10229,9 +9126,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10261,10 +9156,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -10296,9 +9188,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10334,10 +9224,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10373,10 +9260,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10406,15 +9290,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10450,10 +9329,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -10534,10 +9410,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10547,10 +9420,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -10599,10 +9469,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, 
"command": { "anyOf": [ @@ -10654,10 +9521,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -10687,10 +9551,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -10784,10 +9645,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -10816,9 +9674,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10878,9 +9734,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -10896,10 +9750,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -10916,10 +9767,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -10936,18 +9784,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -10977,20 +9820,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -11031,12 +9865,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11050,12 +9879,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - 
"ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11066,9 +9890,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -11076,12 +9898,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11095,12 +9912,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -11111,10 +9923,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -11144,10 +9953,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -11159,11 +9965,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -11185,26 +9987,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -11241,10 +10034,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -11281,10 +10071,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -11314,10 +10101,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": 
["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -11332,9 +10116,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11364,10 +10146,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -11399,9 +10178,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11437,10 +10214,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11476,10 +10250,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11509,15 +10280,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11553,10 +10319,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -11637,10 +10400,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11650,10 +10410,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -11690,10 +10447,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -11745,10 +10499,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -11778,10 +10529,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - 
] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -11875,10 +10623,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -11907,9 +10652,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11969,9 +10712,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -11987,10 +10728,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -12007,10 +10745,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -12027,18 +10762,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -12068,20 +10798,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -12122,12 +10843,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12141,12 +10857,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12157,9 +10868,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -12167,12 +10876,7 @@ "anyOf": [ { "type": 
"string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12186,12 +10890,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -12202,10 +10901,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -12235,10 +10931,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -12250,11 +10943,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -12276,26 +10965,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -12332,10 +11012,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -12372,10 +11049,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -12405,10 +11079,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -12423,9 +11094,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12455,10 +11124,7 @@ }, "type": { "type": "string", - "enum": [ - 
"execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -12490,9 +11156,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12528,10 +11192,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12567,10 +11228,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12600,15 +11258,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -12644,10 +11297,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -12728,10 +11378,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12741,10 +11388,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -12781,10 +11425,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -12836,10 +11477,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -12869,10 +11507,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -12966,10 +11601,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -12998,9 +11630,7 @@ 
"maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13060,9 +11690,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13078,10 +11706,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13098,10 +11723,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -13118,18 +11740,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -13159,20 +11776,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -13213,12 +11821,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13232,12 +11835,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13248,9 +11846,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -13258,12 +11854,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13277,12 +11868,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] 
+ "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -13293,10 +11879,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -13326,10 +11909,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -13341,11 +11921,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -13367,26 +11943,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -13423,10 +11990,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -13463,10 +12027,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -13496,10 +12057,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -13514,9 +12072,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13546,10 +12102,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -13581,9 +12134,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13619,10 
+12170,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13658,10 +12206,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13691,15 +12236,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -13735,10 +12275,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -13819,10 +12356,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -13832,10 +12366,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -13856,11 +12387,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -13871,9 +12398,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -13901,10 +12426,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -13928,10 +12450,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -13945,10 +12464,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -13961,10 +12477,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { 
"type": "integer", @@ -13993,10 +12506,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -14032,11 +12542,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -14067,11 +12573,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -14102,11 +12604,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -14137,11 +12635,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -14151,11 +12645,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -14177,9 +12667,7 @@ "type": "string" } }, - "required": [ - "id" - ], + "required": ["id"], "additionalProperties": false } }, @@ -14234,10 +12722,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -14289,10 +12774,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -14322,10 +12804,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -14419,10 +12898,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -14451,9 +12927,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false 
}, { @@ -14513,9 +12987,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14531,10 +13003,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14551,10 +13020,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -14571,18 +13037,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -14612,20 +13073,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -14666,12 +13118,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14685,12 +13132,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14701,9 +13143,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -14711,12 +13151,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14730,12 +13165,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -14746,10 +13176,7 @@ ] 
} }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -14779,10 +13206,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -14794,11 +13218,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -14820,26 +13240,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -14876,10 +13287,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -14916,10 +13324,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -14949,10 +13354,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -14967,9 +13369,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -14999,10 +13399,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -15034,9 +13431,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15072,10 +13467,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", 
"value"], "additionalProperties": false }, { @@ -15111,10 +13503,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15144,15 +13533,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15188,10 +13572,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -15272,10 +13653,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15285,10 +13663,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -15325,10 +13700,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -15380,10 +13752,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -15413,10 +13782,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -15510,10 +13876,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -15542,9 +13905,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15604,9 +13965,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -15622,10 +13981,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ 
-15642,10 +13998,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -15662,18 +14015,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -15703,20 +14051,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -15757,12 +14096,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15776,12 +14110,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15792,9 +14121,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -15802,12 +14129,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15821,12 +14143,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -15837,10 +14154,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -15870,10 +14184,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { 
"type": "array", @@ -15885,11 +14196,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -15911,26 +14218,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -15967,10 +14265,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -16007,10 +14302,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -16040,10 +14332,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -16058,9 +14347,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16090,10 +14377,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -16125,9 +14409,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16163,10 +14445,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16202,10 +14481,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16235,15 +14511,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - 
"is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16279,10 +14550,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -16363,10 +14631,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16376,10 +14641,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -16416,10 +14678,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -16471,10 +14730,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -16504,10 +14760,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -16601,10 +14854,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -16633,9 +14883,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16695,9 +14943,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -16713,10 +14959,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -16733,10 +14976,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -16753,18 +14993,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } 
] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -16794,20 +15029,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -16848,12 +15074,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -16867,12 +15088,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -16883,9 +15099,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -16893,12 +15107,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -16912,12 +15121,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -16928,10 +15132,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -16961,10 +15162,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -16976,11 +15174,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -17002,26 +15196,17 @@ } } }, - "required": [ - "path", - "match" - 
], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -17058,10 +15243,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -17098,10 +15280,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -17131,10 +15310,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -17149,9 +15325,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17181,10 +15355,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -17216,9 +15387,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17254,10 +15423,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17293,10 +15459,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -17326,15 +15489,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17370,10 +15528,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false 
}, { @@ -17454,10 +15609,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -17467,10 +15619,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -17491,11 +15640,7 @@ }, "strategy": { "type": "string", - "enum": [ - "pass_at_k", - "mean", - "confidence_interval" - ] + "enum": ["pass_at_k", "mean", "confidence_interval"] }, "cost_limit_usd": { "type": "number", @@ -17506,9 +15651,7 @@ "minimum": 0 } }, - "required": [ - "count" - ], + "required": ["count"], "additionalProperties": false }, "total_budget_usd": { @@ -17559,10 +15702,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -17614,10 +15754,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -17647,10 +15784,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -17744,10 +15878,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -17776,9 +15907,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17838,9 +15967,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -17856,10 +15983,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -17876,10 +16000,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -17896,18 +16017,13 @@ "type": "string" } }, - 
"required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -17937,20 +16053,11 @@ }, "type": { "type": "string", - "enum": [ - "tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -17991,12 +16098,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18010,12 +16112,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18026,9 +16123,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -18036,12 +16131,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18055,12 +16145,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -18071,10 +16156,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -18104,10 +16186,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -18119,11 +16198,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { 
"type": "boolean" @@ -18145,26 +16220,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": ["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -18201,10 +16267,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -18241,10 +16304,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -18274,10 +16334,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -18292,9 +16349,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18324,10 +16379,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -18359,9 +16411,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18397,10 +16447,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -18436,10 +16483,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -18469,15 +16513,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18513,10 +16552,7 @@ "type": "string" } }, - "required": 
[ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -18597,10 +16633,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -18610,10 +16643,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -18650,10 +16680,7 @@ }, "type": { "type": "string", - "enum": [ - "code-judge", - "code_judge" - ] + "enum": ["code-judge", "code_judge"] }, "command": { "anyOf": [ @@ -18705,10 +16732,7 @@ "additionalProperties": {} } }, - "required": [ - "type", - "command" - ], + "required": ["type", "command"], "additionalProperties": false }, { @@ -18738,10 +16762,7 @@ }, "type": { "type": "string", - "enum": [ - "llm-judge", - "llm_judge" - ] + "enum": ["llm-judge", "llm_judge"] }, "prompt": { "anyOf": [ @@ -18835,10 +16856,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ -18867,9 +16885,7 @@ "maximum": 2 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18929,9 +16945,7 @@ } } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -18947,10 +16961,7 @@ "maximum": 1 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -18967,10 +16978,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false }, { @@ -18987,18 +16995,13 @@ "type": "string" } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false } ] } }, - "required": [ - "type", - "aggregator" - ], + "required": ["type", "aggregator"], "additionalProperties": false }, { @@ -19028,20 +17031,11 @@ }, "type": { "type": "string", - "enum": [ - 
"tool-trajectory", - "tool_trajectory" - ] + "enum": ["tool-trajectory", "tool_trajectory"] }, "mode": { "type": "string", - "enum": [ - "any_order", - "in_order", - "exact", - "subset", - "superset" - ] + "enum": ["any_order", "in_order", "exact", "subset", "superset"] }, "minimums": { "type": "object", @@ -19082,12 +17076,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19101,12 +17090,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19117,9 +17101,7 @@ ] } }, - "required": [ - "tool" - ], + "required": ["tool"], "additionalProperties": false } }, @@ -19127,12 +17109,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19146,12 +17123,7 @@ "anyOf": [ { "type": "string", - "enum": [ - "exact", - "ignore", - "subset", - "superset" - ] + "enum": ["exact", "ignore", "subset", "superset"] }, { "type": "array", @@ -19162,10 +17134,7 @@ ] } }, - "required": [ - "type", - "mode" - ], + "required": ["type", "mode"], "additionalProperties": false }, { @@ -19195,10 +17164,7 @@ }, "type": { "type": "string", - "enum": [ - "field-accuracy", - "field_accuracy" - ] + "enum": ["field-accuracy", "field_accuracy"] }, "fields": { "type": "array", @@ -19210,11 +17176,7 @@ }, "match": { "type": "string", - "enum": [ - "exact", - "numeric_tolerance", - "date" - ] + "enum": ["exact", "numeric_tolerance", "date"] }, "required": { "type": "boolean" @@ -19236,26 +17198,17 @@ } } }, - "required": [ - "path", - "match" - ], + "required": ["path", "match"], "additionalProperties": false }, "minItems": 1 }, "aggregation": { "type": "string", - "enum": [ - "weighted_average", - "all_or_nothing" - ] + "enum": 
["weighted_average", "all_or_nothing"] } }, - "required": [ - "type", - "fields" - ], + "required": ["type", "fields"], "additionalProperties": false }, { @@ -19292,10 +17245,7 @@ "minimum": 0 } }, - "required": [ - "type", - "threshold" - ], + "required": ["type", "threshold"], "additionalProperties": false }, { @@ -19332,10 +17282,7 @@ "minimum": 0 } }, - "required": [ - "type", - "budget" - ], + "required": ["type", "budget"], "additionalProperties": false }, { @@ -19365,10 +17312,7 @@ }, "type": { "type": "string", - "enum": [ - "token-usage", - "token_usage" - ] + "enum": ["token-usage", "token_usage"] }, "max_total": { "type": "number", @@ -19383,9 +17327,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19415,10 +17357,7 @@ }, "type": { "type": "string", - "enum": [ - "execution-metrics", - "execution_metrics" - ] + "enum": ["execution-metrics", "execution_metrics"] }, "max_tool_calls": { "type": "number", @@ -19450,9 +17389,7 @@ "minimum": 0 } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19488,10 +17425,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -19527,10 +17461,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -19560,15 +17491,10 @@ }, "type": { "type": "string", - "enum": [ - "is-json", - "is_json" - ] + "enum": ["is-json", "is_json"] } }, - "required": [ - "type" - ], + "required": ["type"], "additionalProperties": false }, { @@ -19604,10 +17530,7 @@ "type": "string" } }, - "required": [ - "type", - "value" - ], + "required": ["type", "value"], "additionalProperties": false }, { @@ -19688,10 +17611,7 @@ "minLength": 1 } }, - "required": [ - "score_range", - "outcome" - ], + "required": ["score_range", "outcome"], "additionalProperties": false } } @@ 
-19701,10 +17621,7 @@ "minItems": 1 } }, - "required": [ - "type", - "criteria" - ], + "required": ["type", "criteria"], "additionalProperties": false } ] @@ -19720,10 +17637,7 @@ }, "isolation": { "type": "string", - "enum": [ - "shared", - "per_test" - ] + "enum": ["shared", "per_test"] }, "repos": { "type": "array", @@ -19747,10 +17661,7 @@ "format": "uri" } }, - "required": [ - "type", - "url" - ], + "required": ["type", "url"], "additionalProperties": false }, { @@ -19764,10 +17675,7 @@ "type": "string" } }, - "required": [ - "type", - "path" - ], + "required": ["type", "path"], "additionalProperties": false } ] @@ -19780,10 +17688,7 @@ }, "resolve": { "type": "string", - "enum": [ - "remote", - "local" - ] + "enum": ["remote", "local"] }, "ancestor": { "type": "integer", @@ -19812,10 +17717,7 @@ "additionalProperties": false } }, - "required": [ - "path", - "source" - ], + "required": ["path", "source"], "additionalProperties": false } }, @@ -19851,11 +17753,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -19886,11 +17784,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -19921,11 +17815,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -19956,11 +17846,7 @@ }, "reset": { "type": "string", - "enum": [ - "none", - "fast", - "strict" - ] + "enum": ["none", "fast", "strict"] } }, "additionalProperties": false @@ -19970,11 +17856,7 @@ }, "mode": { "type": "string", - "enum": [ - "pooled", - "temp", - "static" - ] + "enum": ["pooled", "temp", "static"] }, "path": { "type": "string" @@ -19988,9 +17870,7 @@ ] } }, - "required": [ - "tests" - ], + "required": ["tests"], "additionalProperties": false } } From 
3c9ed0019d69762a8ef79a7c5d61144f8ed5bec2 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 19:18:08 +0000 Subject: [PATCH 13/13] chore: remove accidentally committed node_modules symlinks Co-Authored-By: Claude Opus 4.6 --- apps/cli/node_modules | 1 - node_modules | 1 - packages/core/node_modules | 1 - packages/eval/node_modules | 1 - 4 files changed, 4 deletions(-) delete mode 120000 apps/cli/node_modules delete mode 120000 node_modules delete mode 120000 packages/core/node_modules delete mode 120000 packages/eval/node_modules diff --git a/apps/cli/node_modules b/apps/cli/node_modules deleted file mode 120000 index c99229581..000000000 --- a/apps/cli/node_modules +++ /dev/null @@ -1 +0,0 @@ -/home/christso/projects/agentv/apps/cli/node_modules \ No newline at end of file diff --git a/node_modules b/node_modules deleted file mode 120000 index 8cba0ae08..000000000 --- a/node_modules +++ /dev/null @@ -1 +0,0 @@ -/home/christso/projects/agentv/node_modules \ No newline at end of file diff --git a/packages/core/node_modules b/packages/core/node_modules deleted file mode 120000 index a07840188..000000000 --- a/packages/core/node_modules +++ /dev/null @@ -1 +0,0 @@ -/home/christso/projects/agentv/packages/core/node_modules \ No newline at end of file diff --git a/packages/eval/node_modules b/packages/eval/node_modules deleted file mode 120000 index d1cf07368..000000000 --- a/packages/eval/node_modules +++ /dev/null @@ -1 +0,0 @@ -/home/christso/projects/agentv/packages/eval/node_modules \ No newline at end of file