diff --git a/CLAUDE.md b/CLAUDE.md index b6196a7a4..b289ec625 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -35,10 +35,13 @@ If a feature serves a niche use case or adds conditional logic, it belongs in a ### 3. Align with Industry Standards Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation. -### 4. Non-Breaking Extensions +### 4. YAGNI — You Aren't Gonna Need It +Don't build features until there's a concrete need. Before adding a new capability, ask: "Is there real demand for this today, or am I anticipating future needs?" Numeric thresholds, extra tracking fields, and configurable knobs should be omitted until users actually request them. Start with the simplest version (e.g., boolean over numeric range) and extend later if needed. + +### 5. Non-Breaking Extensions New fields should be optional. Existing configurations must continue working unchanged. -### 5. AI-First Design +### 6. AI-First Design AI agents are the primary users of AgentV—not humans reading docs. Design for AI comprehension and composability. 
/** One parsed JSONL line. Every field may be absent because the file is external input. */
type JsonlRecord = Partial<EvaluationResult>;

/**
 * Stream a JSONL file and collect every line that parses to a JSON object.
 *
 * Blank lines, malformed JSON and non-object values (`null`, numbers, strings, arrays)
 * are skipped so that one corrupt line cannot abort a retry.
 */
async function readJsonlRecords(jsonlPath: string): Promise<JsonlRecord[]> {
  const records: JsonlRecord[] = [];
  const rl = createInterface({
    input: createReadStream(jsonlPath),
    // Treat "\r\n" as a single line break regardless of how slowly the chunks arrive.
    crlfDelay: Number.POSITIVE_INFINITY,
  });

  for await (const line of rl) {
    const trimmed = line.trim();
    if (!trimmed) continue;

    let parsed: unknown;
    try {
      parsed = JSON.parse(trimmed);
    } catch {
      // Skip malformed lines
      continue;
    }

    // `JSON.parse` also accepts scalars and arrays; only plain objects can be results.
    if (typeof parsed === 'object' && parsed !== null && !Array.isArray(parsed)) {
      records.push(parsed as JsonlRecord);
    }
  }

  return records;
}

/** True when the record carries a usable (non-empty string) test ID. */
function hasTestId(record: JsonlRecord): record is JsonlRecord & { testId: string } {
  return typeof record.testId === 'string' && record.testId.length > 0;
}

/** True when the record is marked as an execution (infrastructure) error. */
function isExecutionError(record: JsonlRecord): boolean {
  return record.executionStatus === 'execution_error';
}

/**
 * Load test IDs from a JSONL results file that have executionStatus === 'execution_error'.
 *
 * @param jsonlPath - Path to the JSONL output of a previous run.
 * @returns Unique test IDs in order of first appearance; empty when the file has no such records.
 */
export async function loadErrorTestIds(jsonlPath: string): Promise<string[]> {
  const ids = new Set<string>();
  for (const record of await readJsonlRecords(jsonlPath)) {
    if (isExecutionError(record) && hasTestId(record)) {
      ids.add(record.testId);
    }
  }
  return [...ids];
}

/**
 * Load results from a JSONL file that should be preserved when merging retry output.
 *
 * A record is preserved when it has a test ID and a score, and its test ID has no
 * `execution_error` record anywhere in the file. The second condition matters because
 * the retry run selects tests by ID: a test that failed for one target (or in an
 * earlier appended run) is re-run as a whole, so keeping its older non-error rows
 * would duplicate results in the merged output.
 *
 * @param jsonlPath - Path to the JSONL output of a previous run.
 * @returns Preserved results in file order.
 */
export async function loadNonErrorResults(jsonlPath: string): Promise<EvaluationResult[]> {
  const records = await readJsonlRecords(jsonlPath);

  // Same selection rule as loadErrorTestIds, so both functions agree on what is retried.
  const retriedIds = new Set<string>();
  for (const record of records) {
    if (isExecutionError(record) && hasTestId(record)) {
      retriedIds.add(record.testId);
    }
  }

  const preserved: EvaluationResult[] = [];
  for (const record of records) {
    if (!hasTestId(record) || record.score === undefined) continue;
    if (isExecutionError(record) || retriedIds.has(record.testId)) continue;
    preserved.push(record as EvaluationResult);
  }
  return preserved;
}
findRepoRoot } from './shared.js'; import { calculateEvaluationSummary, @@ -74,6 +76,7 @@ interface NormalizedOptions { readonly otelBackend?: string; readonly otelCaptureContent: boolean; readonly otelGroupTurns: boolean; + readonly retryErrors?: string; } function normalizeBoolean(value: unknown): boolean { @@ -225,6 +228,7 @@ function normalizeOptions( otelBackend: normalizeString(rawOptions.otelBackend), otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent), otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns), + retryErrors: normalizeString(rawOptions.retryErrors), } satisfies NormalizedOptions; } @@ -328,6 +332,7 @@ async function prepareFileMetadata(params: { readonly yamlCache?: boolean; readonly yamlCachePath?: string; readonly totalBudgetUsd?: number; + readonly failOnError?: FailOnError; }> { const { testFilePath, repoRoot, cwd, options } = params; @@ -419,6 +424,7 @@ async function prepareFileMetadata(params: { yamlCache: suite.cacheConfig?.enabled, yamlCachePath: suite.cacheConfig?.cachePath, totalBudgetUsd: suite.totalBudgetUsd, + failOnError: suite.failOnError, }; } @@ -460,6 +466,7 @@ async function runSingleEvalFile(params: { readonly trialsConfig?: TrialsConfig; readonly matrixMode?: boolean; readonly totalBudgetUsd?: number; + readonly failOnError?: FailOnError; }): Promise<{ results: EvaluationResult[] }> { const { testFilePath, @@ -480,6 +487,7 @@ async function runSingleEvalFile(params: { trialsConfig, matrixMode, totalBudgetUsd, + failOnError, } = params; const targetName = selection.targetName; @@ -562,6 +570,7 @@ async function runSingleEvalFile(params: { cleanupWorkspaces: options.cleanupWorkspaces, trials: trialsConfig, totalBudgetUsd, + failOnError, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { // Finalize streaming observer span with score @@ -634,7 +643,26 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise // Pass a dummy file in 
cwd so the search starts from the working directory. const yamlConfig = await loadConfig(path.join(cwd, '_'), repoRoot); - const options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution); + + // --retry-errors: override filter to only re-run execution_error test cases. + // IMPORTANT: JSONL must be fully loaded here, before the output writer is created below, + // since the retry source and output destination may refer to the same file. + let retryNonErrorResults: readonly EvaluationResult[] | undefined; + if (options.retryErrors) { + const retryPath = path.resolve(options.retryErrors); + await ensureFileExists(retryPath, 'Retry-errors JSONL file'); + const errorIds = await loadErrorTestIds(retryPath); + if (errorIds.length === 0) { + console.log('No execution errors found in the previous output. Nothing to retry.'); + return; + } + console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`); + // Override the filter to match only error test IDs using micromatch brace expansion + const filterPattern = errorIds.length === 1 ? 
errorIds[0] : `{${errorIds.join(',')}}`; + options = { ...options, filter: filterPattern }; + retryNonErrorResults = await loadNonErrorResults(retryPath); + } if (options.keepWorkspaces && options.cleanupWorkspaces) { console.warn( @@ -767,6 +795,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise readonly yamlCache?: boolean; readonly yamlCachePath?: string; readonly totalBudgetUsd?: number; + readonly failOnError?: FailOnError; } >(); for (const testFilePath of resolvedTestFiles) { @@ -915,6 +944,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise trialsConfig: targetPrep.trialsConfig, matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, + failOnError: targetPrep.failOnError, }); allResults.push(...result.results); @@ -923,6 +953,17 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise progressReporter.finish(); + // Merge non-error results from previous run when using --retry-errors + if (retryNonErrorResults && retryNonErrorResults.length > 0) { + for (const preserved of retryNonErrorResults) { + await outputWriter.append(preserved); + } + allResults.push(...retryNonErrorResults); + console.log( + `Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`, + ); + } + const summary = calculateEvaluationSummary(allResults); console.log(formatEvaluationSummary(summary)); diff --git a/apps/cli/test/unit/retry-errors.test.ts b/apps/cli/test/unit/retry-errors.test.ts new file mode 100644 index 000000000..ab331ee84 --- /dev/null +++ b/apps/cli/test/unit/retry-errors.test.ts @@ -0,0 +1,89 @@ +import { afterEach, describe, expect, it } from 'bun:test'; +import { mkdtempSync, rmSync, writeFileSync } from 'node:fs'; +import { tmpdir } from 'node:os'; +import path from 'node:path'; + +import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js'; + +describe('retry-errors', () => { + let tmpDir: string; + 
+ afterEach(() => { + if (tmpDir) { + rmSync(tmpDir, { recursive: true, force: true }); + } + }); + + function createJsonlFile(lines: object[]): string { + tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); + const filePath = path.join(tmpDir, 'results.jsonl'); + writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n')); + return filePath; + } + + it('loadErrorTestIds returns only execution_error test IDs', async () => { + const filePath = createJsonlFile([ + { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, + { testId: 'case-2', executionStatus: 'execution_error', score: 0, error: 'timeout' }, + { testId: 'case-3', executionStatus: 'quality_failure', score: 0.3 }, + { testId: 'case-4', executionStatus: 'execution_error', score: 0, error: 'provider failed' }, + ]); + + const ids = await loadErrorTestIds(filePath); + expect(ids).toEqual(['case-2', 'case-4']); + }); + + it('loadErrorTestIds deduplicates IDs', async () => { + const filePath = createJsonlFile([ + { testId: 'case-1', executionStatus: 'execution_error', score: 0 }, + { testId: 'case-1', executionStatus: 'execution_error', score: 0 }, + ]); + + const ids = await loadErrorTestIds(filePath); + expect(ids).toEqual(['case-1']); + }); + + it('loadErrorTestIds returns empty array when no errors', async () => { + const filePath = createJsonlFile([ + { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, + { testId: 'case-2', executionStatus: 'quality_failure', score: 0.5 }, + ]); + + const ids = await loadErrorTestIds(filePath); + expect(ids).toEqual([]); + }); + + it('loadNonErrorResults returns only non-error results', async () => { + const filePath = createJsonlFile([ + { testId: 'case-1', executionStatus: 'ok', score: 0.9 }, + { testId: 'case-2', executionStatus: 'execution_error', score: 0 }, + { testId: 'case-3', executionStatus: 'quality_failure', score: 0.5 }, + ]); + + const results = await loadNonErrorResults(filePath); + expect(results).toHaveLength(2); + 
expect(results[0].testId).toBe('case-1'); + expect(results[1].testId).toBe('case-3'); + }); + + it('skips malformed JSON lines', async () => { + tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-')); + const filePath = path.join(tmpDir, 'results.jsonl'); + writeFileSync( + filePath, + [ + JSON.stringify({ testId: 'case-1', executionStatus: 'execution_error', score: 0 }), + 'not valid json', + '', + JSON.stringify({ testId: 'case-2', executionStatus: 'ok', score: 0.9 }), + ].join('\n'), + ); + + const ids = await loadErrorTestIds(filePath); + expect(ids).toEqual(['case-1']); + + const results = await loadNonErrorResults(filePath); + expect(results).toHaveLength(1); + expect(results[0].testId).toBe('case-2'); + }); +}); diff --git a/apps/web/src/content/docs/evaluation/eval-files.mdx b/apps/web/src/content/docs/evaluation/eval-files.mdx index 773a95719..9636fb56a 100644 --- a/apps/web/src/content/docs/evaluation/eval-files.mdx +++ b/apps/web/src/content/docs/evaluation/eval-files.mdx @@ -34,7 +34,7 @@ tests: |-------|-------------| | `description` | Human-readable description of the evaluation | | `dataset` | Optional dataset identifier | -| `execution` | Default execution config (for example `target`) | +| `execution` | Default execution config (`target`, `fail_on_error`, etc.) | | `workspace` | Suite-level workspace config (lifecycle hooks, template) | | `tests` | Array of individual tests, or a string path to an external file | | `assert` | Suite-level evaluators appended to each test unless `execution.skip_defaults: true` is set on the test | diff --git a/apps/web/src/content/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/evaluation/running-evals.mdx index 6e6a6a138..4e8f52a5e 100644 --- a/apps/web/src/content/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/evaluation/running-evals.mdx @@ -87,6 +87,33 @@ agentv eval evals/my-eval.yaml --cleanup-workspaces Workspaces are stored at `~/.agentv/workspaces///`. 
/**
 * Extract `execution.fail_on_error` from a parsed eval suite.
 *
 * The snake_case key `fail_on_error` is read first; the camelCase alias `failOnError`
 * is used only when the snake_case key is absent or null.
 *
 * @param suite - Parsed eval file contents.
 * @returns The boolean setting, or `undefined` when there is no `execution` mapping,
 *   the key is not specified, or the value is not a boolean (a warning is logged in
 *   the last case).
 */
export function extractFailOnError(suite: JsonObject): FailOnError | undefined {
  const execution = suite.execution;
  if (!execution || typeof execution !== 'object' || Array.isArray(execution)) {
    return undefined;
  }

  const executionObj = execution as Record<string, unknown>;
  const raw = executionObj.fail_on_error ?? executionObj.failOnError;

  if (raw === undefined || raw === null) {
    return undefined;
  }

  if (typeof raw === 'boolean') {
    return raw;
  }

  // Mappings and sequences would interpolate as "[object Object]" or a bare
  // comma-joined list; serialise them so the warning shows what the user wrote.
  const shown = typeof raw === 'object' ? JSON.stringify(raw) : String(raw);
  logWarning(`Invalid execution.fail_on_error: ${shown}. Must be true or false. Ignoring.`);
  return undefined;
}
promises = filteredEvalCases.map((evalCase) => limit(async () => { @@ -499,6 +506,40 @@ export async function runEvaluation( return budgetResult; } + // Check fail_on_error before dispatching + if (failOnError === true && failOnErrorTriggered) { + const errorMsg = 'Halted: execution error encountered with fail_on_error enabled'; + const haltResult: EvaluationResult = { + timestamp: (now ?? (() => new Date()))().toISOString(), + testId: evalCase.id, + dataset: evalCase.dataset, + score: 0, + hits: [], + misses: [], + answer: '', + target: target.name, + error: errorMsg, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'error_threshold_exceeded', + executionError: { message: errorMsg, stage: 'setup' }, + }; + + if (onProgress) { + await onProgress({ + workerId, + testId: evalCase.id, + status: 'failed', + completedAt: Date.now(), + error: haltResult.error, + }); + } + if (onResult) { + await onResult(haltResult); + } + return haltResult; + } + if (onProgress) { await onProgress({ workerId, @@ -558,6 +599,11 @@ export async function runEvaluation( } } + // Track fail_on_error + if (failOnError === true && result.executionStatus === 'execution_error') { + failOnErrorTriggered = true; + } + // Attach beforeAllOutput to first result only if (beforeAllOutput && !beforeAllOutputAttached) { result = { ...result, beforeAllOutput }; diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 657583818..ae46ce924 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -846,6 +846,13 @@ export interface ExecutionError { readonly stage: FailureStage; } +/** + * Tolerance for execution errors in an eval run. + * - `true`: halt on first execution error + * - `false`: never halt on errors (default) + */ +export type FailOnError = boolean; + /** * Evaluator scorecard for a single eval case run. 
*/ diff --git a/packages/core/src/evaluation/validation/eval-file.schema.ts b/packages/core/src/evaluation/validation/eval-file.schema.ts index 45c394278..e33240e16 100644 --- a/packages/core/src/evaluation/validation/eval-file.schema.ts +++ b/packages/core/src/evaluation/validation/eval-file.schema.ts @@ -305,6 +305,9 @@ const TrialsSchema = z.object({ costLimitUsd: z.number().min(0).optional(), }); +/** Execution error tolerance: true or false */ +const FailOnErrorSchema = z.boolean(); + const ExecutionSchema = z.object({ target: z.string().optional(), targets: z.array(z.string()).optional(), @@ -315,6 +318,8 @@ const ExecutionSchema = z.object({ trials: TrialsSchema.optional(), total_budget_usd: z.number().min(0).optional(), totalBudgetUsd: z.number().min(0).optional(), + fail_on_error: FailOnErrorSchema.optional(), + failOnError: FailOnErrorSchema.optional(), }); // --------------------------------------------------------------------------- diff --git a/packages/core/src/evaluation/yaml-parser.ts b/packages/core/src/evaluation/yaml-parser.ts index bbfef1997..1754a5c2c 100644 --- a/packages/core/src/evaluation/yaml-parser.ts +++ b/packages/core/src/evaluation/yaml-parser.ts @@ -6,6 +6,7 @@ import { parse } from 'yaml'; import { expandFileReferences, loadCasesFromFile } from './loaders/case-file-loader.js'; import { extractCacheConfig, + extractFailOnError, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, @@ -48,6 +49,7 @@ export { buildPromptInputs, type PromptInputs } from './formatting/prompt-builde export { DEFAULT_EVAL_PATTERNS, extractCacheConfig, + extractFailOnError, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, @@ -161,6 +163,8 @@ export type EvalSuiteResult = { readonly metadata?: import('./metadata.js').EvalMetadata; /** Suite-level total cost budget in USD */ readonly totalBudgetUsd?: number; + /** Execution error tolerance: true or false */ + readonly failOnError?: 
import('./types.js').FailOnError; }; /** @@ -178,6 +182,7 @@ export async function loadTestSuite( } const { tests, parsed } = await loadTestsFromYaml(evalFilePath, repoRoot, options); const metadata = parseMetadata(parsed); + const failOnError = extractFailOnError(parsed); return { tests, trials: extractTrialsConfig(parsed), @@ -185,6 +190,7 @@ export async function loadTestSuite( cacheConfig: extractCacheConfig(parsed), totalBudgetUsd: extractTotalBudgetUsd(parsed), ...(metadata !== undefined && { metadata }), + ...(failOnError !== undefined && { failOnError }), }; } diff --git a/packages/core/test/evaluation/loaders/config-loader.test.ts b/packages/core/test/evaluation/loaders/config-loader.test.ts index 3bb49b673..df9b0f155 100644 --- a/packages/core/test/evaluation/loaders/config-loader.test.ts +++ b/packages/core/test/evaluation/loaders/config-loader.test.ts @@ -1,6 +1,7 @@ import { describe, expect, it } from 'bun:test'; import { + extractFailOnError, extractTargetFromSuite, extractTargetsFromSuite, extractTargetsFromTestCase, @@ -264,6 +265,43 @@ describe('extractTotalBudgetUsd', () => { }); }); +describe('extractFailOnError', () => { + it('returns undefined when no execution block', () => { + const suite: JsonObject = { tests: [] }; + expect(extractFailOnError(suite)).toBeUndefined(); + }); + + it('returns undefined when fail_on_error not set', () => { + const suite: JsonObject = { execution: { target: 'default' } }; + expect(extractFailOnError(suite)).toBeUndefined(); + }); + + it('returns true for fail_on_error: true', () => { + const suite: JsonObject = { execution: { fail_on_error: true } }; + expect(extractFailOnError(suite)).toBe(true); + }); + + it('returns false for fail_on_error: false', () => { + const suite: JsonObject = { execution: { fail_on_error: false } }; + expect(extractFailOnError(suite)).toBe(false); + }); + + it('returns undefined for numeric value', () => { + const suite: JsonObject = { execution: { fail_on_error: 0.3 } }; + 
expect(extractFailOnError(suite)).toBeUndefined(); + }); + + it('returns undefined for invalid string value', () => { + const suite: JsonObject = { execution: { fail_on_error: 'always' } }; + expect(extractFailOnError(suite)).toBeUndefined(); + }); + + it('supports camelCase failOnError alias', () => { + const suite: JsonObject = { execution: { failOnError: true } }; + expect(extractFailOnError(suite)).toBe(true); + }); +}); + describe('parseExecutionDefaults', () => { it('returns undefined when no execution block', () => { expect(parseExecutionDefaults(undefined, '/test/config.yaml')).toBeUndefined(); diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 6be7439e7..d5958acb8 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -2223,3 +2223,85 @@ describe('suite-level total budget guardrail', () => { expect(results[3].error).toContain('Suite budget exceeded'); }); }); + +describe('fail_on_error tolerance', () => { + it('fail_on_error: true halts on first execution error', async () => { + let callCount = 0; + const errorOnFirstProvider: Provider = { + id: 'mock:error-on-first', + kind: 'mock' as const, + targetName: 'error-on-first', + async invoke(): Promise { + callCount++; + if (callCount === 1) { + throw new Error('Provider failed'); + } + return { output: [{ role: 'assistant', content: 'ok' }] }; + }, + }; + + const evalCases: EvalTest[] = [ + { ...baseTestCase, id: 'fail-case' }, + { ...baseTestCase, id: 'skip-case-1' }, + { ...baseTestCase, id: 'skip-case-2' }, + ]; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: 'in-memory', + target: baseTarget, + providerFactory: () => errorOnFirstProvider, + evaluators: evaluatorRegistry, + evalCases, + failOnError: true, + maxConcurrency: 1, + }); + + expect(results).toHaveLength(3); + // First case is execution_error from provider + 
expect(results[0].executionStatus).toBe('execution_error'); + expect(results[0].failureReasonCode).toBe('provider_error'); + // Remaining cases should be halted by error_threshold_exceeded + expect(results[1].executionStatus).toBe('execution_error'); + expect(results[1].failureReasonCode).toBe('error_threshold_exceeded'); + expect(results[2].executionStatus).toBe('execution_error'); + expect(results[2].failureReasonCode).toBe('error_threshold_exceeded'); + }); + + it('fail_on_error: false never halts on errors', async () => { + let callCount = 0; + const alwaysErrorProvider: Provider = { + id: 'mock:always-error', + kind: 'mock' as const, + targetName: 'always-error', + async invoke(): Promise { + callCount++; + throw new Error(`Provider failed call ${callCount}`); + }, + }; + + const evalCases: EvalTest[] = [ + { ...baseTestCase, id: 'err-1' }, + { ...baseTestCase, id: 'err-2' }, + { ...baseTestCase, id: 'err-3' }, + ]; + + const results = await runEvaluation({ + testFilePath: 'in-memory.yaml', + repoRoot: 'in-memory', + target: baseTarget, + providerFactory: () => alwaysErrorProvider, + evaluators: evaluatorRegistry, + evalCases, + failOnError: false, + maxConcurrency: 1, + }); + + expect(results).toHaveLength(3); + // All are actual provider errors, none are halted + for (const r of results) { + expect(r.executionStatus).toBe('execution_error'); + expect(r.failureReasonCode).toBe('provider_error'); + } + }); +}); diff --git a/plugins/agentv-dev/skills/agentv-eval-builder/SKILL.md b/plugins/agentv-dev/skills/agentv-eval-builder/SKILL.md index 7b34a8820..25292a13d 100644 --- a/plugins/agentv-dev/skills/agentv-eval-builder/SKILL.md +++ b/plugins/agentv-dev/skills/agentv-eval-builder/SKILL.md @@ -366,6 +366,18 @@ LLM-judged structured evaluation with weighted criteria. Criteria items support Top-level `rubrics:` field is deprecated. Use `type: rubrics` under `assert` instead. See `references/rubric-evaluator.md` for score-range mode and scoring formula. 
+## Execution Error Tolerance + +Control how the runner handles execution errors (infrastructure failures, not quality failures): + +```yaml +execution: + fail_on_error: false # never halt (default) + # fail_on_error: true # halt on first execution error +``` + +When a run halts (only possible with `fail_on_error: true`), the remaining tests are recorded with `executionStatus: 'execution_error'` and `failureReasonCode: 'error_threshold_exceeded'`. + ## CLI Commands ```bash @@ -383,6 +395,9 @@ agentv prompt eval # orchestrat agentv prompt eval input --test-id # task input JSON (file paths, not embedded content) agentv prompt eval judge --test-id --answer-file f # judge prompts / code judge results +# Re-run only execution errors from a previous output +agentv eval <eval-file> --retry-errors <previous-output.jsonl> + # Validate eval file agentv validate diff --git a/plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json b/plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json index 39d35c250..7c55c0224 100644 --- a/plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json +++ b/plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json @@ -4513,6 +4513,12 @@ "totalBudgetUsd": { "type": "number", "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" } }, "additionalProperties": false @@ -9231,6 +9237,12 @@ "totalBudgetUsd": { "type": "number", "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" } }, "additionalProperties": false @@ -11707,6 +11719,12 @@ "totalBudgetUsd": { "type": "number", "minimum": 0 + }, + "fail_on_error": { + "type": "boolean" + }, + "failOnError": { + "type": "boolean" } }, "additionalProperties": false