From 2324584df4b2e7da3f3201f41a2ec0bad053bdf8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:23:31 +0000 Subject: [PATCH 1/9] feat: add ExecutionStatus, FailureStage types to EvaluationResult (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/types.ts | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index fb500385c..657583818 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -782,6 +782,12 @@ export interface TrialResult { readonly scores?: readonly EvaluatorResult[]; readonly error?: string; readonly costUsd?: number; + /** Primary classification for this trial attempt */ + readonly executionStatus?: ExecutionStatus; + /** Pipeline stage where failure occurred */ + readonly failureStage?: FailureStage; + /** Machine-readable failure reason code */ + readonly failureReasonCode?: string; } /** @@ -819,6 +825,27 @@ export interface ConfidenceIntervalAggregation { */ export type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation; +/** + * Primary classification of evaluation outcome. + * - 'ok': evaluation completed, score reflects model quality (score >= 0.8) + * - 'quality_failure': evaluation completed but model scored below threshold + * - 'execution_error': evaluation could not complete due to infrastructure/tooling error + */ +export type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error'; + +/** + * Pipeline stage where the failure occurred. + */ +export type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown'; + +/** + * Structured error detail for execution failures. + */ +export interface ExecutionError { + readonly message: string; + readonly stage: FailureStage; +} + /** * Evaluator scorecard for a single eval case run. 
*/ @@ -876,6 +903,14 @@ export interface EvaluationResult { readonly costLimited?: boolean; /** Whether the evaluation was skipped due to suite-level budget exhaustion */ readonly budgetExceeded?: boolean; + /** Primary classification: ok, quality_failure, or execution_error */ + readonly executionStatus: ExecutionStatus; + /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */ + readonly failureStage?: FailureStage; + /** Machine-readable failure reason code (only when executionStatus !== 'ok') */ + readonly failureReasonCode?: string; + /** Structured error detail (only when executionStatus === 'execution_error') */ + readonly executionError?: ExecutionError; } export type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip'; From 1e23df18c4c0b6fe493170217fa7410f5d1fa0fd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:26:43 +0000 Subject: [PATCH 2/9] feat: classify execution errors at each orchestrator catch site (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/orchestrator.ts | 65 ++++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index f6c9b86d8..f5a571cca 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -42,6 +42,8 @@ import type { EvaluatorConfig, EvaluatorKind, EvaluatorResult, + ExecutionStatus, + FailureStage, JsonObject, JsonValue, TrialResult, @@ -466,6 +468,13 @@ export async function runEvaluation( target: target.name, error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, budgetExceeded: true, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'budget_exceeded', + executionError: { + message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + stage: 'setup', + }, 
}; if (onProgress) { @@ -599,6 +608,8 @@ export async function runEvaluation( outcome.reason, promptInputs, primaryProvider, + 'agent', + 'provider_error', ); results.push(errorResult); if (onResult) { @@ -788,7 +799,14 @@ async function runBatchEvaluation(options: { }); if (providerError) { - result = { ...result, error: providerError }; + result = { + ...result, + error: providerError, + executionStatus: 'execution_error' as const, + failureStage: 'agent' as const, + failureReasonCode: 'provider_error', + executionError: { message: providerError, stage: 'agent' as const }, + }; } } catch (error) { const errorResult = buildErrorResult( @@ -798,6 +816,8 @@ async function runBatchEvaluation(options: { error, promptInputs, provider, + 'evaluator', + 'evaluator_error', ); results.push(errorResult); if (onResult) { @@ -899,6 +919,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 + ? 'ok' + : 'quality_failure'; + const finalResult = providerError - ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } - : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput }; + ? { + ...result, + error: providerError, + executionStatus, + failureStage: 'agent' as const, + failureReasonCode: 'provider_error', + executionError: { message: providerError, stage: 'agent' as const }, + beforeAllOutput, + beforeEachOutput, + afterEachOutput, + } + : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput }; // Determine if this is a failure (has error or low score) const isFailure = !!finalResult.error || finalResult.score < 0.5; @@ -1205,6 +1253,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 ? 
'ok' as const : 'quality_failure' as const, }; } @@ -1861,7 +1912,9 @@ function buildErrorResult( timestamp: Date, error: unknown, promptInputs: PromptInputs, - provider?: Provider, + provider: Provider | undefined, + failureStage: FailureStage, + failureReasonCode: string, ): EvaluationResult { const message = error instanceof Error ? error.message : String(error); @@ -1913,6 +1966,10 @@ function buildErrorResult( requests, input, error: message, + executionStatus: 'execution_error', + failureStage, + failureReasonCode, + executionError: { message, stage: failureStage }, } satisfies EvaluationResult; } From 091a67d7c170c5cf82a5fbab13b97e3e7817b9a6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:30:00 +0000 Subject: [PATCH 3/9] feat: propagate executionStatus through trial aggregation (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/orchestrator.ts | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index f5a571cca..dbcb64fea 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1307,6 +1307,9 @@ async function runEvalCaseWithTrials( scores: result.scores, error: result.error, costUsd: trialCost, + executionStatus: result.executionStatus, + failureStage: result.failureStage, + failureReasonCode: result.failureReasonCode, }; trialResults.push(trial); @@ -1343,12 +1346,37 @@ async function runEvalCaseWithTrials( ); const baseResult = allResults[bestTrialIndex]; + // Determine aggregate executionStatus from trial results: + // - If ANY trial succeeded → ok + // - If ALL trials had execution_error → execution_error + // - Otherwise → quality_failure + const hasOk = trialResults.some((t) => t.executionStatus === 'ok'); + const allExecutionError = + trialResults.length > 0 && trialResults.every((t) => t.executionStatus === 'execution_error'); + const 
aggregateExecutionStatus: ExecutionStatus = hasOk + ? 'ok' + : allExecutionError + ? 'execution_error' + : 'quality_failure'; + + // When the aggregate status differs from baseResult, clear failure fields that no longer apply + const aggregateFailureStage = + aggregateExecutionStatus === 'ok' ? undefined : baseResult.failureStage; + const aggregateFailureReasonCode = + aggregateExecutionStatus === 'ok' ? undefined : baseResult.failureReasonCode; + const aggregateExecutionError = + aggregateExecutionStatus === 'execution_error' ? baseResult.executionError : undefined; + return { ...baseResult, score, trials: trialResults, aggregation, costLimited: costLimited || undefined, + executionStatus: aggregateExecutionStatus, + failureStage: aggregateFailureStage, + failureReasonCode: aggregateFailureReasonCode, + executionError: aggregateExecutionError, }; } From 1e23b27cb560656260887ebbaa949625de5187b7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:31:17 +0000 Subject: [PATCH 4/9] feat: separate execution errors from quality metrics in summary (#431) Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 96 ++++++++++++++++++++---- 1 file changed, 81 insertions(+), 15 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index c830592fa..6ed2b1568 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -17,6 +17,11 @@ export interface EvaluationSummary { readonly bottomResults: readonly EvaluationResult[]; readonly errorCount: number; readonly errors: readonly { readonly testId: string; readonly error: string }[]; + readonly executionErrorCount: number; + readonly qualityFailureCount: number; + readonly passedCount: number; + readonly byFailureStage: Readonly>; + readonly byFailureReason: Readonly>; } const HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1]; @@ -80,7 +85,6 @@ function buildHistogram(values: readonly 
number[]): readonly HistogramBin[] { export function calculateEvaluationSummary( results: readonly EvaluationResult[], ): EvaluationSummary { - const scores = results.map((result) => result.score); const total = results.length; // Track errors @@ -102,20 +106,52 @@ export function calculateEvaluationSummary( bottomResults: [], errorCount: 0, errors: [], + executionErrorCount: 0, + qualityFailureCount: 0, + passedCount: 0, + byFailureStage: {}, + byFailureReason: {}, }; } - const mean = computeMean(scores); - const median = computeMedian(scores); - const min = Math.min(...scores); - const max = Math.max(...scores); - const standardDeviation = computeStandardDeviation(scores); - const histogram = buildHistogram(scores); - - const sortedResults = [...results].sort((a, b) => b.score - a.score); + // Separate execution errors from quality results + const executionErrors = results.filter((r) => r.executionStatus === 'execution_error'); + const qualityResults = results.filter((r) => r.executionStatus !== 'execution_error'); + const qualityScores = qualityResults.map((r) => r.score); + + // Compute quality metrics from non-execution-error results only + const mean = computeMean(qualityScores); + const median = computeMedian(qualityScores); + const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0; + const max = qualityScores.length > 0 ? 
Math.max(...qualityScores) : 0; + const standardDeviation = computeStandardDeviation(qualityScores); + const histogram = buildHistogram(qualityScores); + + // Top/bottom results consider only non-execution-error results + const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score); const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length)); const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length)); + // Count by execution status + const executionErrorCount = executionErrors.length; + const qualityFailureCount = results.filter( + (r) => r.executionStatus === 'quality_failure', + ).length; + const passedCount = results.filter((r) => r.executionStatus === 'ok').length; + + // Aggregate by failure stage and reason + const byFailureStage: Record = {}; + const byFailureReason: Record = {}; + for (const result of results) { + if (result.failureStage) { + byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1; + } + if (result.failureReasonCode) { + byFailureReason[result.failureReasonCode] = + (byFailureReason[result.failureReasonCode] ?? 
0) + 1; + } + } + return { total, mean, @@ -128,6 +164,11 @@ export function calculateEvaluationSummary( bottomResults, errorCount, errors, + executionErrorCount, + qualityFailureCount, + passedCount, + byFailureStage, + byFailureReason, }; } @@ -145,7 +186,7 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { // Display errors first if any exist if (summary.errorCount > 0) { lines.push('\n=================================================='); - lines.push('ERRORS'); + lines.push('EXECUTION ERRORS'); lines.push('=================================================='); for (const error of summary.errors) { lines.push(`\n❌ ${error.testId}`); @@ -158,13 +199,22 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push('EVALUATION SUMMARY'); lines.push('=================================================='); lines.push(`Total tests: ${summary.total}`); - - if (summary.errorCount > 0) { - lines.push(`Failed: ${summary.errorCount}`); - lines.push(`Passed: ${summary.total - summary.errorCount}`); + lines.push(`Passed: ${summary.passedCount}`); + if (summary.qualityFailureCount > 0) { + lines.push(`Quality failures: ${summary.qualityFailureCount}`); + } + if (summary.executionErrorCount > 0) { + lines.push(`Execution errors: ${summary.executionErrorCount}`); } - lines.push(`Mean score: ${formatScore(summary.mean)}`); + if (summary.executionErrorCount > 0) { + const qualityCount = summary.total - summary.executionErrorCount; + lines.push( + `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`, + ); + } else { + lines.push(`Mean score: ${formatScore(summary.mean)}`); + } lines.push(`Median score: ${formatScore(summary.median)}`); lines.push(`Min score: ${formatScore(summary.min)}`); lines.push(`Max score: ${formatScore(summary.max)}`); @@ -188,6 +238,22 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { 
lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`); }); + const failureStageEntries = Object.entries(summary.byFailureStage); + if (failureStageEntries.length > 0) { + lines.push('\nExecution errors by stage:'); + for (const [stage, count] of failureStageEntries) { + lines.push(` ${stage}: ${count}`); + } + } + + const failureReasonEntries = Object.entries(summary.byFailureReason); + if (failureReasonEntries.length > 0) { + lines.push('\nExecution errors by reason:'); + for (const [reason, count] of failureReasonEntries) { + lines.push(` ${reason}: ${count}`); + } + } + return lines.join('\n'); } From 12a9f4d07c5294b7d628f22c15ab71601bac49d7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:34:45 +0000 Subject: [PATCH 5/9] test: add executionStatus assertions to existing orchestrator tests (#431) Add executionStatus, failureStage, failureReasonCode, and executionError assertions to representative existing tests: - Success path: assert executionStatus === 'ok' - Provider throw: assert execution_error with agent stage - Provider raw.error: assert execution_error with provider_error code - Setup script failure: assert execution_error with setup stage - Successful workspace scripts: assert ok status Co-Authored-By: Claude Opus 4.6 --- .../core/test/evaluation/orchestrator.test.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 6262e779a..6be7439e7 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -177,6 +177,9 @@ describe('runTestCase', () => { expect(result.misses).toHaveLength(0); expect(result.timestamp).toBe('2024-01-01T00:00:00.000Z'); expect(result.input).toBe('Explain logging improvements'); + expect(result.executionStatus).toBe('ok'); + expect(result.failureStage).toBeUndefined(); + 
expect(result.failureReasonCode).toBeUndefined(); }); it('reuses cached provider response when available', async () => { @@ -258,6 +261,11 @@ describe('runTestCase', () => { expect(result.score).toBe(0); expect(result.misses[0]).toContain('Provider failure'); expect(result.input).toBe('Explain logging improvements'); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + expect(result.failureReasonCode).toBe('provider_error'); + expect(result.executionError).toBeDefined(); + expect(result.executionError?.message).toContain('Provider failure'); }); it('surfaces provider raw.error as evaluation error', async () => { @@ -278,6 +286,9 @@ describe('runTestCase', () => { }); expect(result.error).toBe("Batch output missing id 'case-1'"); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + expect(result.failureReasonCode).toBe('provider_error'); }); it('reports failed progress status for batch item errors', async () => { @@ -1423,6 +1434,7 @@ rl.on('close', () => { expect(result.beforeAllOutput).toContain('Setup done for case-1'); expect(result.error).toBeUndefined(); + expect(result.executionStatus).toBe('ok'); }); it('returns error result when setup script fails', async () => { @@ -1463,6 +1475,10 @@ rl.on('close', () => { expect(result.error).toContain('before_all script failed'); expect(result.score).toBe(0); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('setup'); + expect(result.failureReasonCode).toBe('script_error'); + expect(result.executionError).toBeDefined(); }); it('executes teardown script and captures output in result', async () => { @@ -1520,6 +1536,7 @@ rl.on('close', () => { expect(result.afterEachOutput).toContain('Teardown done for case-1'); expect(result.error).toBeUndefined(); + expect(result.executionStatus).toBe('ok'); }); }); From c3b6e7040f4fec6322d84ca993cd2ea5a8d31d47 Mon Sep 17 00:00:00 2001 From: 
Christopher Tso Date: Fri, 6 Mar 2026 00:35:55 +0000 Subject: [PATCH 6/9] test: add dedicated execution status classification tests (#431) New test file covering all executionStatus classification paths: - Provider throw -> execution_error with agent stage - High score (>=0.8) -> ok - Low score (<0.8) -> quality_failure - Backward compatibility: error field still set alongside executionError - Threshold boundary tests at exactly 0.8 and 0.79 Co-Authored-By: Claude Opus 4.6 --- .../test/evaluation/execution-status.test.ts | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 packages/core/test/evaluation/execution-status.test.ts diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts new file mode 100644 index 000000000..21748c162 --- /dev/null +++ b/packages/core/test/evaluation/execution-status.test.ts @@ -0,0 +1,232 @@ +import { describe, expect, it } from 'bun:test'; + +import { runEvalCase } from '../../src/evaluation/orchestrator.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { + Provider, + ProviderResponse, +} from '../../src/evaluation/providers/types.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; + +// --------------------------------------------------------------------------- +// Mock providers +// --------------------------------------------------------------------------- + +class ErrorProvider implements Provider { + readonly id = 'mock:error'; + readonly kind = 'mock' as const; + readonly targetName = 'error-target'; + + async invoke(): Promise { + throw new Error('Provider failed'); + } +} + +class FixedResponseProvider implements Provider { + readonly id = 'mock:fixed'; + readonly kind = 'mock' as const; + readonly targetName = 'fixed-target'; + + constructor(private readonly response: string) {} + + async invoke(): Promise { + return { + output: [{ role: 'assistant', content: 
this.response }], + }; + } +} + +// --------------------------------------------------------------------------- +// Shared fixtures +// --------------------------------------------------------------------------- + +const baseTestCase: EvalTest = { + id: 'exec-status-1', + dataset: 'test-dataset', + question: 'Explain logging improvements', + input: [{ role: 'user', content: 'Explain logging improvements' }], + input_segments: [{ type: 'text', value: 'Explain logging improvements' }], + expected_output: [], + reference_answer: '- add structured logging\n- avoid global state', + guideline_paths: [], + file_paths: [], + criteria: 'Logging improved', + evaluator: 'llm_judge', +}; + +const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, +}; + +/** Returns a score >= 0.8 → executionStatus 'ok' */ +const highScoreEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.9, + verdict: 'pass' as const, + hits: ['good answer'], + misses: [], + expectedAspectCount: 1, + }; + }, + }, +}; + +/** Returns a score < 0.8 → executionStatus 'quality_failure' */ +const lowScoreEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.3, + verdict: 'fail' as const, + hits: [], + misses: ['missed the point'], + expectedAspectCount: 1, + }; + }, + }, +}; + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('execution status classification', () => { + it('classifies provider errors as execution_error with agent stage', async () => { + const provider = new ErrorProvider(); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + 
expect(result.failureReasonCode).toBe('provider_error'); + expect(result.executionError).toBeDefined(); + expect(result.executionError?.message).toContain('Provider failed'); + expect(result.executionError?.stage).toBe('agent'); + // Backward compat: error field still set + expect(result.error).toBeDefined(); + expect(result.score).toBe(0); + }); + + it('classifies high-scoring results as ok', async () => { + const provider = new FixedResponseProvider( + 'Add structured logging and avoid global state.', + ); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + expect(result.executionStatus).toBe('ok'); + expect(result.failureStage).toBeUndefined(); + expect(result.failureReasonCode).toBeUndefined(); + expect(result.executionError).toBeUndefined(); + expect(result.score).toBeGreaterThanOrEqual(0.8); + }); + + it('classifies low-scoring results as quality_failure', async () => { + const provider = new FixedResponseProvider('I have no idea about logging.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: lowScoreEvaluators, + }); + + expect(result.executionStatus).toBe('quality_failure'); + expect(result.failureStage).toBeUndefined(); + expect(result.failureReasonCode).toBeUndefined(); + expect(result.executionError).toBeUndefined(); + expect(result.score).toBeLessThan(0.8); + }); + + it('preserves backward-compatible error field on execution errors', async () => { + const provider = new ErrorProvider(); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + // Both old and new fields are set + expect(result.error).toBeDefined(); + expect(typeof result.error).toBe('string'); + expect(result.executionStatus).toBe('execution_error'); + expect(result.executionError).toBeDefined(); + 
expect(result.executionError?.message).toBe(result.error); + }); + + it('sets executionStatus to ok at exact 0.8 threshold', async () => { + const thresholdEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.8, + verdict: 'pass' as const, + hits: ['acceptable'], + misses: [], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new FixedResponseProvider('Adequate answer.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: thresholdEvaluators, + }); + + expect(result.executionStatus).toBe('ok'); + expect(result.score).toBe(0.8); + }); + + it('sets executionStatus to quality_failure just below threshold', async () => { + const belowThresholdEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.79, + verdict: 'fail' as const, + hits: [], + misses: ['barely missed'], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new FixedResponseProvider('Almost adequate answer.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: belowThresholdEvaluators, + }); + + expect(result.executionStatus).toBe('quality_failure'); + expect(result.score).toBe(0.79); + }); +}); From b56dc9cafd559f5c3ad28c5a28db01211d6fcf19 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:38:21 +0000 Subject: [PATCH 7/9] fix: add executionStatus to baseline test fixture (#431) Add required executionStatus: 'ok' to makeFullResult() in baseline.test.ts to match the updated EvaluationResult type which now requires this field. 
Co-Authored-By: Claude Opus 4.6 --- ...026-02-26-eval-schema-generation-design.md | 652 ++++++++++++++++++ .../core/test/evaluation/baseline.test.ts | 1 + 2 files changed, 653 insertions(+) create mode 100644 docs/plans/2026-02-26-eval-schema-generation-design.md diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md new file mode 100644 index 000000000..c1edcc646 --- /dev/null +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -0,0 +1,652 @@ +# Eval Schema Generation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Auto-generate `eval-schema.json` from a Zod schema and add a diff test to catch drift. + +**Architecture:** Create a comprehensive Zod schema (`eval-file.schema.ts`) that mirrors the eval YAML file structure. A generator script converts it to JSON Schema via `zod-to-json-schema`. A test regenerates and diffs against the committed file — if they diverge, it fails. + +**Tech Stack:** Zod, zod-to-json-schema, Vitest + +--- + +### Task 1: Add `zod-to-json-schema` dependency + +**Files:** +- Modify: `packages/core/package.json` + +**Step 1: Install the dependency** + +Run: `cd /home/christso/projects/agentv && bun add -d zod-to-json-schema --cwd packages/core` + +**Step 2: Verify installation** + +Run: `grep zod-to-json-schema packages/core/package.json` +Expected: `"zod-to-json-schema": "^3.x.x"` in devDependencies + +**Step 3: Commit** + +```bash +git add packages/core/package.json bun.lock +git commit -m "chore: add zod-to-json-schema dev dependency" +``` + +--- + +### Task 2: Create the eval file Zod schema + +**Files:** +- Create: `packages/core/src/evaluation/validation/eval-file.schema.ts` + +**Context:** This schema represents the **YAML input format** (what users write), not the parsed runtime types. 
Key differences from runtime types: +- Uses snake_case field names (YAML convention) +- Includes shorthands (string input → message array) +- Includes deprecated aliases (eval_cases, script, expected_outcome) +- Uses `additionalProperties` / `.passthrough()` where custom config is allowed +- Does NOT include resolved/computed fields (resolvedCwd, resolvedPromptPath, etc.) + +The schema should import `EVALUATOR_KIND_VALUES` from `types.ts` to stay in sync with the evaluator kind enum. + +**Step 1: Write the schema file** + +Create `packages/core/src/evaluation/validation/eval-file.schema.ts` with: + +```typescript +/** + * Zod schema for eval YAML file format. + * Used to generate eval-schema.json for AI agent reference. + * + * IMPORTANT: This schema describes the YAML input format, not the parsed runtime types. + * When adding new eval features, update this schema AND run `bun run generate:schema` + * to regenerate eval-schema.json. The sync test will fail if they diverge. + */ +import { z } from 'zod'; + +// --------------------------------------------------------------------------- +// Shared primitives +// --------------------------------------------------------------------------- + +/** Message content: string or structured array */ +const ContentItemSchema = z.object({ + type: z.enum(['text', 'file']), + value: z.string(), +}); + +const MessageContentSchema = z.union([ + z.string(), + z.array(ContentItemSchema), +]); + +const MessageSchema = z.object({ + role: z.enum(['system', 'user', 'assistant', 'tool']), + content: MessageContentSchema, +}); + +/** Input: string shorthand or message array */ +const InputSchema = z.union([z.string(), z.array(MessageSchema)]); + +/** Expected output: string, object, or message array */ +const ExpectedOutputSchema = z.union([ + z.string(), + z.record(z.unknown()), + z.array(MessageSchema), +]); + +// --------------------------------------------------------------------------- +// Evaluator schemas (YAML input format) +// 
--------------------------------------------------------------------------- + +/** Common fields shared by all evaluators */ +const EvaluatorCommonSchema = z.object({ + name: z.string().optional(), + weight: z.number().min(0).optional(), + required: z.union([z.boolean(), z.number().gt(0).lte(1)]).optional(), + negate: z.boolean().optional(), +}); + +/** Prompt: string (inline/file path) or executable script config */ +const PromptSchema = z.union([ + z.string(), + z.object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + script: z.union([z.string(), z.array(z.string())]).optional(), + config: z.record(z.unknown()).optional(), + }), +]); + +/** Score range for analytic rubrics */ +const ScoreRangeSchema = z.object({ + score_range: z.tuple([z.number().int().min(0).max(10), z.number().int().min(0).max(10)]), + outcome: z.string().min(1), +}); + +/** Rubric item (checklist or score-range mode) */ +const RubricItemSchema = z.object({ + id: z.string().optional(), + outcome: z.string().optional(), + weight: z.number().optional(), + required: z.boolean().optional(), + required_min_score: z.number().int().min(0).max(10).optional(), + score_ranges: z.array(ScoreRangeSchema).optional(), +}); + +// --- Type-specific evaluator schemas --- + +const CodeJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('code_judge'), + command: z.union([z.string(), z.array(z.string())]), + script: z.union([z.string(), z.array(z.string())]).optional(), + cwd: z.string().optional(), + target: z.union([z.boolean(), z.object({ max_calls: z.number().optional() })]).optional(), + config: z.record(z.unknown()).optional(), +}); + +const LlmJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('llm_judge'), + prompt: PromptSchema.optional(), + rubrics: z.array(RubricItemSchema).optional(), + model: z.string().optional(), + config: z.record(z.unknown()).optional(), +}); + +/** Aggregator configs for composite evaluator */ +const AggregatorSchema = 
z.discriminatedUnion('type', [ + z.object({ + type: z.literal('weighted_average'), + weights: z.record(z.number()).optional(), + }), + z.object({ + type: z.literal('threshold'), + threshold: z.number().min(0).max(1), + }), + z.object({ + type: z.literal('code_judge'), + path: z.string(), + cwd: z.string().optional(), + }), + z.object({ + type: z.literal('llm_judge'), + prompt: z.string().optional(), + model: z.string().optional(), + }), +]); + +// Use z.lazy for recursive composite evaluator +const CompositeSchema: z.ZodType = z.lazy(() => + EvaluatorCommonSchema.extend({ + type: z.literal('composite'), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + aggregator: AggregatorSchema, + }), +); + +const ArgsMatchSchema = z.union([ + z.enum(['exact', 'ignore', 'subset', 'superset']), + z.array(z.string()), +]); + +const ToolTrajectoryExpectedItemSchema = z.object({ + tool: z.string(), + args: z.union([z.literal('any'), z.record(z.unknown())]).optional(), + max_duration_ms: z.number().min(0).optional(), + maxDurationMs: z.number().min(0).optional(), + args_match: ArgsMatchSchema.optional(), + argsMatch: ArgsMatchSchema.optional(), +}); + +const ToolTrajectorySchema = EvaluatorCommonSchema.extend({ + type: z.literal('tool_trajectory'), + mode: z.enum(['any_order', 'in_order', 'exact', 'subset', 'superset']), + minimums: z.record(z.number().int().min(0)).optional(), + expected: z.array(ToolTrajectoryExpectedItemSchema).optional(), + args_match: ArgsMatchSchema.optional(), + argsMatch: ArgsMatchSchema.optional(), +}); + +const FieldConfigSchema = z.object({ + path: z.string(), + match: z.enum(['exact', 'numeric_tolerance', 'date']), + required: z.boolean().optional(), + weight: z.number().optional(), + tolerance: z.number().min(0).optional(), + relative: z.boolean().optional(), + formats: z.array(z.string()).optional(), +}); + +const FieldAccuracySchema = EvaluatorCommonSchema.extend({ + type: z.literal('field_accuracy'), + 
fields: z.array(FieldConfigSchema).min(1), + aggregation: z.enum(['weighted_average', 'all_or_nothing']).optional(), +}); + +const LatencySchema = EvaluatorCommonSchema.extend({ + type: z.literal('latency'), + threshold: z.number().min(0), +}); + +const CostSchema = EvaluatorCommonSchema.extend({ + type: z.literal('cost'), + budget: z.number().min(0), +}); + +const TokenUsageSchema = EvaluatorCommonSchema.extend({ + type: z.literal('token_usage'), + max_total: z.number().min(0).optional(), + max_input: z.number().min(0).optional(), + max_output: z.number().min(0).optional(), +}); + +const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('execution_metrics'), + max_tool_calls: z.number().min(0).optional(), + max_llm_calls: z.number().min(0).optional(), + max_tokens: z.number().min(0).optional(), + max_cost_usd: z.number().min(0).optional(), + max_duration_ms: z.number().min(0).optional(), + target_exploration_ratio: z.number().min(0).max(1).optional(), + exploration_tolerance: z.number().min(0).optional(), +}); + +const AgentJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('agent_judge'), + prompt: z.string().optional(), + rubrics: z.array(RubricItemSchema).optional(), + max_steps: z.number().int().min(1).max(50).optional(), + temperature: z.number().min(0).max(2).optional(), + target: z.string().optional(), +}); + +const ContainsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('contains'), + value: z.string(), +}); + +const RegexSchema = EvaluatorCommonSchema.extend({ + type: z.literal('regex'), + value: z.string(), +}); + +const IsJsonSchema = EvaluatorCommonSchema.extend({ + type: z.literal('is_json'), +}); + +const EqualsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('equals'), + value: z.string(), +}); + +const RubricsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('rubrics'), + criteria: z.array(RubricItemSchema).min(1), +}); + +/** Union of all evaluator types */ +const EvaluatorSchema = 
z.union([ + CodeJudgeSchema, + LlmJudgeSchema, + CompositeSchema, + ToolTrajectorySchema, + FieldAccuracySchema, + LatencySchema, + CostSchema, + TokenUsageSchema, + ExecutionMetricsSchema, + AgentJudgeSchema, + ContainsSchema, + RegexSchema, + IsJsonSchema, + EqualsSchema, + RubricsSchema, +]); + +// --------------------------------------------------------------------------- +// Workspace +// --------------------------------------------------------------------------- + +const WorkspaceScriptSchema = z.object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + script: z.union([z.string(), z.array(z.string())]).optional(), + timeout_ms: z.number().min(0).optional(), + cwd: z.string().optional(), +}); + +const WorkspaceSchema = z.object({ + template: z.string().optional(), + before_all: WorkspaceScriptSchema.optional(), + after_all: WorkspaceScriptSchema.optional(), + before_each: WorkspaceScriptSchema.optional(), + after_each: WorkspaceScriptSchema.optional(), +}); + +// --------------------------------------------------------------------------- +// Execution block +// --------------------------------------------------------------------------- + +const TrialsSchema = z.object({ + count: z.number().int().min(1), + strategy: z.enum(['pass_at_k', 'mean', 'confidence_interval']).optional(), + cost_limit_usd: z.number().min(0).optional(), + costLimitUsd: z.number().min(0).optional(), +}); + +const ExecutionSchema = z.object({ + target: z.string().optional(), + targets: z.array(z.string()).optional(), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + skip_defaults: z.boolean().optional(), + cache: z.boolean().optional(), + trials: TrialsSchema.optional(), + total_budget_usd: z.number().min(0).optional(), + totalBudgetUsd: z.number().min(0).optional(), +}); + +// --------------------------------------------------------------------------- +// Test case +// 
--------------------------------------------------------------------------- + +const EvalTestSchema = z.object({ + id: z.string().min(1), + criteria: z.string().optional(), + expected_outcome: z.string().optional(), + input: InputSchema.optional(), + expected_output: ExpectedOutputSchema.optional(), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + execution: ExecutionSchema.optional(), + workspace: WorkspaceSchema.optional(), + metadata: z.record(z.unknown()).optional(), + conversation_id: z.string().optional(), + dataset: z.string().optional(), + note: z.string().optional(), +}); + +// --------------------------------------------------------------------------- +// Top-level eval file +// --------------------------------------------------------------------------- + +export const EvalFileSchema = z.object({ + $schema: z.string().optional(), + // Metadata + name: z.string().regex(/^[a-z0-9-]+$/).optional(), + description: z.string().optional(), + version: z.string().optional(), + author: z.string().optional(), + tags: z.array(z.string()).optional(), + license: z.string().optional(), + requires: z.object({ agentv: z.string().optional() }).optional(), + // Suite-level input + input: InputSchema.optional(), + // Tests (array or external file path) + tests: z.union([z.array(EvalTestSchema), z.string()]), + // Deprecated aliases + eval_cases: z.union([z.array(EvalTestSchema), z.string()]).optional(), + // Target + target: z.string().optional(), + // Execution + execution: ExecutionSchema.optional(), + // Suite-level assertions + assert: z.array(EvaluatorSchema).optional(), + // Workspace + workspace: WorkspaceSchema.optional(), +}); +``` + +**Step 2: Verify the file compiles** + +Run: `cd /home/christso/projects/agentv && bunx tsc --noEmit packages/core/src/evaluation/validation/eval-file.schema.ts --esModuleInterop --moduleResolution bundler --module esnext --target es2022 --strict` + +If tsc is fussy with standalone 
file checking, just run the full typecheck: +Run: `bun run typecheck --filter @agentv/core` + +**Step 3: Commit** + +```bash +git add packages/core/src/evaluation/validation/eval-file.schema.ts +git commit -m "feat: add Zod schema for eval YAML file format" +``` + +--- + +### Task 3: Create the generator script + +**Files:** +- Create: `packages/core/scripts/generate-eval-schema.ts` +- Modify: `packages/core/package.json` (add script) + +**Step 1: Write the generator script** + +Create `packages/core/scripts/generate-eval-schema.ts`: + +```typescript +#!/usr/bin/env bun +/** + * Generates eval-schema.json from the Zod schema. + * Run: bun run generate:schema (from packages/core) + * Or: bun packages/core/scripts/generate-eval-schema.ts (from repo root) + */ +import { zodToJsonSchema } from 'zod-to-json-schema'; +import { writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { EvalFileSchema } from '../src/evaluation/validation/eval-file.schema.js'; + +const jsonSchema = zodToJsonSchema(EvalFileSchema, { + name: 'EvalFile', + $refStrategy: 'none', +}); + +// Add JSON Schema metadata +const schema = { + $schema: 'http://json-schema.org/draft-07/schema#', + title: 'AgentV Eval File', + description: 'Schema for AgentV evaluation YAML files (.eval.yaml)', + ...jsonSchema, +}; + +const outputPath = path.resolve( + import.meta.dirname, + '../../../plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json', +); + +await writeFile(outputPath, `${JSON.stringify(schema, null, 2)}\n`); +console.log(`Generated: ${outputPath}`); +``` + +**Step 2: Add the script to package.json** + +Add to `packages/core/package.json` scripts: +```json +"generate:schema": "bun scripts/generate-eval-schema.ts" +``` + +**Step 3: Run the generator and verify output** + +Run: `cd /home/christso/projects/agentv/packages/core && bun run generate:schema` +Expected: `Generated: .../plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json` + +Inspect 
**Step 1: Write the sync test (it should pass immediately, since the schema was just generated in Task 3)**
YAML files (.eval.yaml)', + ...generated, + }; + + // Compare (ignoring formatting differences) + expect(JSON.parse(JSON.stringify(committed))).toEqual( + JSON.parse(JSON.stringify(expected)), + ); + }); +}); +``` + +**Step 2: Run the test to verify it passes** + +Run: `cd /home/christso/projects/agentv && bun test packages/core/test/evaluation/validation/eval-schema-sync.test.ts` +Expected: PASS (since we just generated the schema in Task 3) + +**Step 3: Commit** + +```bash +git add packages/core/test/evaluation/validation/eval-schema-sync.test.ts +git commit -m "test: add eval-schema.json sync test" +``` + +--- + +### Task 5: Also copy generated schema to CLI dist templates + +**Context:** The schema is also bundled in `apps/cli/dist/templates/`. Check if this is done by the build or needs manual sync. + +**Step 1: Check how CLI templates reference the schema** + +Run: `diff plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json apps/cli/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json` + +If they differ, the CLI build should copy from the source. Check the CLI build process: +Run: `grep -r "eval-schema" apps/cli/tsup.config.ts apps/cli/package.json 2>/dev/null` + +If no copy step exists, the template copies are stale artifacts. Either: +- Add a copy step to the CLI build, or +- Note this as out of scope (the CLI templates are created by `agentv create` and may have their own update cycle) + +**Step 2: Determine action and commit if needed** + +This step is investigative — commit only if a change is needed. 
+ +--- + +### Task 6: Run full test suite and push + +**Step 1: Run all tests** + +Run: `cd /home/christso/projects/agentv && bun run test` +Expected: All tests pass + +**Step 2: Run typecheck** + +Run: `cd /home/christso/projects/agentv && bun run typecheck` +Expected: No errors + +**Step 3: Run lint** + +Run: `cd /home/christso/projects/agentv && bun run lint` +Expected: No errors (fix any formatting issues from generated file) + +**Step 4: Push the branch** + +Run: `git push -u origin chore/update-eval-schema` + +--- + +### Task 7: Create PR and file follow-up issue + +**Step 1: Create PR** + +```bash +gh pr create --title "chore: auto-generate eval-schema.json from Zod" --body "$(cat <<'EOF' +## Summary +- Adds a comprehensive Zod schema (`eval-file.schema.ts`) that describes the eval YAML file format +- Generates `eval-schema.json` from this Zod schema via `zod-to-json-schema` +- Adds a sync test that regenerates and diffs — fails if schema drifts from Zod + +## Motivation +The JSON schema was manually maintained and had drifted significantly from the actual validation logic. This ensures the schema stays current as the codebase evolves. + +## How to update the schema +When adding new eval features, update `eval-file.schema.ts` and run: +```bash +cd packages/core && bun run generate:schema +``` + +## Test plan +- [ ] `bun test packages/core/test/evaluation/validation/eval-schema-sync.test.ts` passes +- [ ] Full test suite passes +- [ ] Schema validates against existing example eval files + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` + +**Step 2: File follow-up issue for Approach B** + +```bash +gh issue create --title "refactor: migrate eval-validator.ts from procedural to Zod-based validation" --body "$(cat <<'EOF' +## Context +The eval file validation in `eval-validator.ts` uses procedural if/else logic (~500+ lines). 
A parallel Zod schema (`eval-file.schema.ts`) was added in #431 for JSON Schema generation, creating two sources of truth.
to prevent threshold drift - Scope byFailureStage/byFailureReason aggregation to execution_error results only (was iterating all results) Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 4 ++-- packages/core/src/evaluation/orchestrator.ts | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 6ed2b1568..4ace301de 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -139,10 +139,10 @@ export function calculateEvaluationSummary( ).length; const passedCount = results.filter((r) => r.executionStatus === 'ok').length; - // Aggregate by failure stage and reason + // Aggregate by failure stage and reason (execution errors only) const byFailureStage: Record = {}; const byFailureReason: Record = {}; - for (const result of results) { + for (const result of executionErrors) { if (result.failureStage) { byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1; } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index dbcb64fea..9401b13db 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -69,6 +69,13 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise = T | Promise; +/** Threshold for classifying ok vs quality_failure (score >= threshold → ok). */ +const QUALITY_PASS_THRESHOLD = 0.8; + +function classifyQualityStatus(score: number): ExecutionStatus { + return score >= QUALITY_PASS_THRESHOLD ? 'ok' : 'quality_failure'; +} + function usesFileReferencePrompt(provider: Provider): boolean { return isAgentProvider(provider) || provider.kind === 'cli'; } @@ -1212,9 +1219,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 - ? 
'ok' - : 'quality_failure'; + : classifyQualityStatus(result.score); const finalResult = providerError ? { @@ -1511,7 +1516,7 @@ async function evaluateCandidate(options: { trace: trace, output: output, fileChanges, - executionStatus: score.score >= 0.8 ? 'ok' as const : 'quality_failure' as const, + executionStatus: classifyQualityStatus(score.score), }; } From 015b77da2d286255ba12460f5053eee41809339c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 01:40:12 +0000 Subject: [PATCH 9/9] style: fix biome formatting in statistics and execution-status test Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 4 +--- packages/core/test/evaluation/execution-status.test.ts | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 4ace301de..42106c0d7 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -134,9 +134,7 @@ export function calculateEvaluationSummary( // Count by execution status const executionErrorCount = executionErrors.length; - const qualityFailureCount = results.filter( - (r) => r.executionStatus === 'quality_failure', - ).length; + const qualityFailureCount = results.filter((r) => r.executionStatus === 'quality_failure').length; const passedCount = results.filter((r) => r.executionStatus === 'ok').length; // Aggregate by failure stage and reason (execution errors only) diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts index 21748c162..21d222542 100644 --- a/packages/core/test/evaluation/execution-status.test.ts +++ b/packages/core/test/evaluation/execution-status.test.ts @@ -2,10 +2,7 @@ import { describe, expect, it } from 'bun:test'; import { runEvalCase } from '../../src/evaluation/orchestrator.js'; import type { ResolvedTarget } from 
'../../src/evaluation/providers/targets.js'; -import type { - Provider, - ProviderResponse, -} from '../../src/evaluation/providers/types.js'; +import type { Provider, ProviderResponse } from '../../src/evaluation/providers/types.js'; import type { EvalTest } from '../../src/evaluation/types.js'; // --------------------------------------------------------------------------- @@ -119,9 +116,7 @@ describe('execution status classification', () => { }); it('classifies high-scoring results as ok', async () => { - const provider = new FixedResponseProvider( - 'Add structured logging and avoid global state.', - ); + const provider = new FixedResponseProvider('Add structured logging and avoid global state.'); const result = await runEvalCase({ evalCase: baseTestCase,