From d015d37378d594b72a27bcb90382f9ce91123bdf Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 12:54:57 +0000 Subject: [PATCH 1/6] fix(cli): use mean score for RESULT verdict instead of all-must-pass MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RESULT: PASS/FAIL line used all-must-pass logic (every individual case must score >= 0.8), while --threshold used mean-based scoring. This caused confusing contradictory output: RESULT: FAIL (28/31 passed, mean score: 0.927) Suite score: 0.93 (threshold: 0.80) — PASS Now the RESULT line uses mean >= 0.8, consistent with --threshold. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/statistics.ts | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 910052d2..896dc8da 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -193,10 +193,9 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push(''); } - // Overall verdict line - const overallPassed = - summary.passedCount === summary.total - summary.executionErrorCount || - (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0); + // Overall verdict line — use mean score against PASS_THRESHOLD (0.8), + // consistent with --threshold behavior. + const overallPassed = summary.mean >= 0.8; const overallVerdict = overallPassed ? 'PASS' : 'FAIL'; const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false); const verdictColor = overallPassed ? 
'\x1b[32m' : '\x1b[31m'; From 08ce5e172945d5d854a9312359d4bff6f3ec1bb5 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 12:59:14 +0000 Subject: [PATCH 2/6] fix(cli): show pass rate in RESULT verdict and threshold check MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The RESULT line and --threshold check now both use pass rate (fraction of cases scoring >= 0.8) instead of inconsistent metrics. Previously the RESULT line used all-must-pass while --threshold used mean score. Before: RESULT: FAIL (28/31 passed, mean score: 0.927) Suite score: 0.93 (threshold: 0.80) — PASS After: RESULT: FAIL (pass rate: 90.3%, 28/31 passed, mean score: 0.927) Suite pass rate: 90.3% (threshold: 80.0%) — PASS Both paths now consistently use pass rate. The RESULT line is informational (all-must-pass), while --threshold gates CI exit code against a configurable pass rate minimum. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/run-eval.ts | 2 +- apps/cli/src/commands/eval/statistics.ts | 23 +++++--- apps/cli/test/commands/eval/threshold.test.ts | 59 +++++++++++++++---- 3 files changed, 62 insertions(+), 22 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 32d3318f..d660d336 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -1191,7 +1191,7 @@ export async function runEvalCommand( // Threshold quality gate check let thresholdFailed = false; if (resolvedThreshold !== undefined) { - const thresholdResult = formatThresholdSummary(summary.mean, resolvedThreshold); + const thresholdResult = formatThresholdSummary(summary, resolvedThreshold); console.log(`\n${thresholdResult.message}`); thresholdFailed = !thresholdResult.passed; } diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 896dc8da..8d74885c 100644 --- 
a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -193,13 +193,17 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push(''); } - // Overall verdict line — use mean score against PASS_THRESHOLD (0.8), - // consistent with --threshold behavior. - const overallPassed = summary.mean >= 0.8; + // Overall verdict: all non-error cases must score >= PASS_THRESHOLD (0.8). + // Pass rate shows what fraction met the threshold; mean score is informational. + const gradedCount = summary.total - summary.executionErrorCount; + const passRate = gradedCount > 0 ? summary.passedCount / gradedCount : 0; + const overallPassed = + summary.passedCount === gradedCount || + (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0); const overallVerdict = overallPassed ? 'PASS' : 'FAIL'; const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false); const verdictColor = overallPassed ? '\x1b[32m' : '\x1b[31m'; - const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${summary.total} passed, mean score: ${formatScore(summary.mean)})`; + const verdictText = `RESULT: ${overallVerdict} (pass rate: ${(passRate * 100).toFixed(1)}%, ${summary.passedCount}/${gradedCount} passed, mean score: ${formatScore(summary.mean)})`; lines.push('\n=================================================='); if (useColor) { @@ -336,14 +340,17 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin /** * Format a threshold check summary line. - * Returns whether the threshold was met and the formatted message. + * Uses pass rate (fraction of cases scoring >= PASS_THRESHOLD) against the + * user-supplied threshold, consistent with the RESULT verdict logic. 
*/ export function formatThresholdSummary( - meanScore: number, + summary: EvaluationSummary, threshold: number, ): { passed: boolean; message: string } { - const passed = meanScore >= threshold; + const gradedCount = summary.total - summary.executionErrorCount; + const passRate = gradedCount > 0 ? summary.passedCount / gradedCount : 0; + const passed = passRate >= threshold; const verdict = passed ? 'PASS' : 'FAIL'; - const message = `Suite score: ${meanScore.toFixed(2)} (threshold: ${threshold.toFixed(2)}) — ${verdict}`; + const message = `Suite pass rate: ${(passRate * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%) — ${verdict}`; return { passed, message }; } diff --git a/apps/cli/test/commands/eval/threshold.test.ts b/apps/cli/test/commands/eval/threshold.test.ts index 65c05916..a8619f4e 100644 --- a/apps/cli/test/commands/eval/threshold.test.ts +++ b/apps/cli/test/commands/eval/threshold.test.ts @@ -1,31 +1,64 @@ import { describe, expect, it } from 'bun:test'; -import { formatThresholdSummary } from '../../../src/commands/eval/statistics.js'; +import { + type EvaluationSummary, + formatThresholdSummary, +} from '../../../src/commands/eval/statistics.js'; + +function makeSummary(passed: number, total: number): EvaluationSummary { + return { + total, + mean: 0, + median: 0, + min: 0, + max: 0, + histogram: [], + topResults: [], + bottomResults: [], + errorCount: 0, + errors: [], + executionErrorCount: 0, + qualityFailureCount: total - passed, + passedCount: passed, + byFailureStage: {}, + byFailureReason: {}, + }; +} describe('formatThresholdSummary', () => { - it('returns PASS when mean score meets threshold', () => { - const result = formatThresholdSummary(0.85, 0.6); + it('returns PASS when pass rate meets threshold', () => { + const result = formatThresholdSummary(makeSummary(9, 10), 0.6); expect(result.passed).toBe(true); - expect(result.message).toContain('0.85'); - expect(result.message).toContain('0.60'); + 
expect(result.message).toContain('90.0%'); + expect(result.message).toContain('60.0%'); expect(result.message).toContain('PASS'); }); - it('returns FAIL when mean score is below threshold', () => { - const result = formatThresholdSummary(0.53, 0.6); + it('returns FAIL when pass rate is below threshold', () => { + const result = formatThresholdSummary(makeSummary(5, 10), 0.6); expect(result.passed).toBe(false); - expect(result.message).toContain('0.53'); - expect(result.message).toContain('0.60'); + expect(result.message).toContain('50.0%'); + expect(result.message).toContain('60.0%'); expect(result.message).toContain('FAIL'); }); - it('returns PASS when mean score exactly equals threshold', () => { - const result = formatThresholdSummary(0.6, 0.6); + it('returns PASS when pass rate exactly equals threshold', () => { + const result = formatThresholdSummary(makeSummary(6, 10), 0.6); + expect(result.passed).toBe(true); + }); + + it('returns PASS for threshold 0 with any pass rate', () => { + const result = formatThresholdSummary(makeSummary(0, 10), 0); expect(result.passed).toBe(true); }); - it('returns PASS for threshold 0 with any score', () => { - const result = formatThresholdSummary(0, 0); + it('excludes execution errors from pass rate calculation', () => { + const summary = makeSummary(8, 10); + // 2 execution errors, so graded = 10 - 2 = 8, pass rate = 8/8 = 100% + (summary as { executionErrorCount: number }).executionErrorCount = 2; + (summary as { qualityFailureCount: number }).qualityFailureCount = 0; + const result = formatThresholdSummary(summary, 1.0); expect(result.passed).toBe(true); + expect(result.message).toContain('100.0%'); }); }); From 6803b40076a710504af8467e5927cb7cc3ac6ab4 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 13:05:39 +0000 Subject: [PATCH 3/6] =?UTF-8?q?fix(cli):=20simplify=20RESULT=20line=20?= =?UTF-8?q?=E2=80=94=20show=20passed/failed=20counts=20clearly?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 
Content-Transfer-Encoding: 8bit RESULT: FAIL (28 passed, 3 failed, mean score: 0.927) The failed count makes it immediately obvious why the verdict is FAIL. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/statistics.ts | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 8d74885c..e8630dca 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -194,16 +194,15 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { } // Overall verdict: all non-error cases must score >= PASS_THRESHOLD (0.8). - // Pass rate shows what fraction met the threshold; mean score is informational. const gradedCount = summary.total - summary.executionErrorCount; - const passRate = gradedCount > 0 ? summary.passedCount / gradedCount : 0; const overallPassed = summary.passedCount === gradedCount || (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0); const overallVerdict = overallPassed ? 'PASS' : 'FAIL'; const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false); const verdictColor = overallPassed ? 
'\x1b[32m' : '\x1b[31m'; - const verdictText = `RESULT: ${overallVerdict} (pass rate: ${(passRate * 100).toFixed(1)}%, ${summary.passedCount}/${gradedCount} passed, mean score: ${formatScore(summary.mean)})`; + const failedCount = gradedCount - summary.passedCount; + const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount} passed, ${failedCount} failed, mean score: ${formatScore(summary.mean)})`; lines.push('\n=================================================='); if (useColor) { From 54c2a46f8d9994e6a3a7e0f8356ac07fb13a4adc Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 13:15:27 +0000 Subject: [PATCH 4/6] docs: update --threshold docs and CLI help to reflect pass rate The --threshold flag now gates on pass rate, not mean score. Update the CLI help text and docs site to match. Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/commands/run.ts | 2 +- apps/web/src/content/docs/docs/evaluation/running-evals.mdx | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 5df5ee42..414a65c3 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -178,7 +178,7 @@ export const evalRunCommand = command({ threshold: option({ type: optional(number), long: 'threshold', - description: 'Suite-level quality gate: exit 1 if mean score falls below this value (0-1)', + description: 'Suite-level quality gate: exit 1 if pass rate falls below this value (0-1)', }), }, handler: async (args) => { diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index 0388223d..fda9dcf2 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -242,7 +242,7 @@ When halted, remaining tests are recorded with `failureReasonCode: 
'error_thresh ### Suite-Level Quality Threshold -Set a minimum mean score for the eval suite. If the mean quality score falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. +Set a minimum pass rate for the eval suite. The **pass rate** is the fraction of test cases that score >= 0.8 (the per-case pass threshold). If the pass rate falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. **CLI flag:** @@ -257,12 +257,12 @@ execution: threshold: 0.8 ``` -The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1. Mean score is computed from quality results only (execution errors are excluded). +The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1. Pass rate is computed from quality results only (execution errors are excluded). When active, a summary line is printed after the eval results: ``` -Suite score: 0.85 (threshold: 0.80) — PASS +Suite pass rate: 90.3% (threshold: 80.0%) — PASS ``` The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as `` in JUnit output. When no threshold is set, JUnit defaults to 0.5. From 71a6ef612efb3c6cf053568668b39ca6ce4a2736 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 13:34:44 +0000 Subject: [PATCH 5/6] fix(cli): make --threshold override per-test score requirement MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --threshold now configures the per-test score requirement (default 0.8) instead of comparing mean score. The RESULT verdict and exit code are now consistent: exit 1 when any test scores below the threshold. 
Before (contradictory): RESULT: FAIL (28/31 passed, mean score: 0.927) Suite score: 0.93 (threshold: 0.80) — PASS ← exit code 0 After (consistent, with --threshold 0.8): RESULT: FAIL (28/31 scored >= 0.8, mean: 0.927) ← exit code 1 With --threshold 0.95: RESULT: FAIL (20/31 scored >= 0.95, mean: 0.927) ← exit code 1 Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/commands/run.ts | 3 +- apps/cli/src/commands/eval/run-eval.ts | 16 ++-- apps/cli/src/commands/eval/statistics.ts | 44 +++++----- apps/cli/test/commands/eval/threshold.test.ts | 83 ++++++++----------- .../docs/docs/evaluation/running-evals.mdx | 8 +- 5 files changed, 66 insertions(+), 88 deletions(-) diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 414a65c3..95ee3ac8 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -178,7 +178,8 @@ export const evalRunCommand = command({ threshold: option({ type: optional(number), long: 'threshold', - description: 'Suite-level quality gate: exit 1 if pass rate falls below this value (0-1)', + description: + 'Per-test score threshold (0-1, default 0.8). Exit 1 if any test scores below this value', }), }, handler: async (args) => { diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index d660d336..6e26463b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -46,7 +46,6 @@ import { calculateEvaluationSummary, formatEvaluationSummary, formatMatrixSummary, - formatThresholdSummary, } from './statistics.js'; import { type TargetSelection, selectMultipleTargets, selectTarget } from './targets.js'; @@ -1185,16 +1184,13 @@ export async function runEvalCommand( ); } - const summary = calculateEvaluationSummary(allResults); - console.log(formatEvaluationSummary(summary)); + const thresholdOpts = + resolvedThreshold !== undefined ? 
{ threshold: resolvedThreshold } : undefined; + const summary = calculateEvaluationSummary(allResults, thresholdOpts); + console.log(formatEvaluationSummary(summary, thresholdOpts)); - // Threshold quality gate check - let thresholdFailed = false; - if (resolvedThreshold !== undefined) { - const thresholdResult = formatThresholdSummary(summary, resolvedThreshold); - console.log(`\n${thresholdResult.message}`); - thresholdFailed = !thresholdResult.passed; - } + // Exit code matches RESULT verdict: fail if any test scored below threshold. + const thresholdFailed = resolvedThreshold !== undefined && summary.qualityFailureCount > 0; // Print matrix summary when multiple targets were evaluated if (isMatrixMode && allResults.length > 0) { diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index e8630dca..38aa4c50 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -84,6 +84,7 @@ function buildHistogram(values: readonly number[]): readonly HistogramBin[] { export function calculateEvaluationSummary( results: readonly EvaluationResult[], + options?: { threshold?: number }, ): EvaluationSummary { const total = results.length; @@ -132,10 +133,19 @@ export function calculateEvaluationSummary( const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length)); const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length)); - // Count by execution status + // Count by execution status. When a custom threshold is provided, + // recompute passed/failed from raw scores instead of executionStatus + // (which uses the hardcoded PASS_THRESHOLD of 0.8). 
const executionErrorCount = executionErrors.length; - const qualityFailureCount = results.filter((r) => r.executionStatus === 'quality_failure').length; - const passedCount = results.filter((r) => r.executionStatus === 'ok').length; + const scoreThreshold = options?.threshold; + const passedCount = + scoreThreshold !== undefined + ? qualityResults.filter((r) => r.score >= scoreThreshold).length + : results.filter((r) => r.executionStatus === 'ok').length; + const qualityFailureCount = + scoreThreshold !== undefined + ? qualityResults.filter((r) => r.score < scoreThreshold).length + : results.filter((r) => r.executionStatus === 'quality_failure').length; // Aggregate by failure stage and reason (execution errors only) const byFailureStage: Record = {}; @@ -174,7 +184,10 @@ function formatScore(value: number): string { return value.toFixed(3); } -export function formatEvaluationSummary(summary: EvaluationSummary): string { +export function formatEvaluationSummary( + summary: EvaluationSummary, + options?: { threshold?: number }, +): string { if (summary.total === 0) { return '\nNo results to summarize'; } @@ -193,16 +206,16 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push(''); } - // Overall verdict: all non-error cases must score >= PASS_THRESHOLD (0.8). + // Overall verdict: all non-error cases must score >= per-test threshold. const gradedCount = summary.total - summary.executionErrorCount; + const threshold = options?.threshold ?? 0.8; const overallPassed = summary.passedCount === gradedCount || (summary.qualityFailureCount === 0 && summary.executionErrorCount === 0); const overallVerdict = overallPassed ? 'PASS' : 'FAIL'; const useColor = !(process.env.NO_COLOR !== undefined) && (process.stdout.isTTY ?? false); const verdictColor = overallPassed ? 
'\x1b[32m' : '\x1b[31m'; - const failedCount = gradedCount - summary.passedCount; - const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount} passed, ${failedCount} failed, mean score: ${formatScore(summary.mean)})`; + const verdictText = `RESULT: ${overallVerdict} (${summary.passedCount}/${gradedCount} scored >= ${threshold}, mean: ${formatScore(summary.mean)})`; lines.push('\n=================================================='); if (useColor) { @@ -336,20 +349,3 @@ export function formatMatrixSummary(results: readonly EvaluationResult[]): strin return lines.join('\n'); } - -/** - * Format a threshold check summary line. - * Uses pass rate (fraction of cases scoring >= PASS_THRESHOLD) against the - * user-supplied threshold, consistent with the RESULT verdict logic. - */ -export function formatThresholdSummary( - summary: EvaluationSummary, - threshold: number, -): { passed: boolean; message: string } { - const gradedCount = summary.total - summary.executionErrorCount; - const passRate = gradedCount > 0 ? summary.passedCount / gradedCount : 0; - const passed = passRate >= threshold; - const verdict = passed ? 
'PASS' : 'FAIL'; - const message = `Suite pass rate: ${(passRate * 100).toFixed(1)}% (threshold: ${(threshold * 100).toFixed(1)}%) — ${verdict}`; - return { passed, message }; -} diff --git a/apps/cli/test/commands/eval/threshold.test.ts b/apps/cli/test/commands/eval/threshold.test.ts index a8619f4e..0d729ecf 100644 --- a/apps/cli/test/commands/eval/threshold.test.ts +++ b/apps/cli/test/commands/eval/threshold.test.ts @@ -1,64 +1,49 @@ import { describe, expect, it } from 'bun:test'; -import { - type EvaluationSummary, - formatThresholdSummary, -} from '../../../src/commands/eval/statistics.js'; +import type { EvaluationResult } from '@agentv/core'; -function makeSummary(passed: number, total: number): EvaluationSummary { +import { calculateEvaluationSummary } from '../../../src/commands/eval/statistics.js'; + +function makeResult(testId: string, score: number): EvaluationResult { return { - total, - mean: 0, - median: 0, - min: 0, - max: 0, - histogram: [], - topResults: [], - bottomResults: [], - errorCount: 0, - errors: [], - executionErrorCount: 0, - qualityFailureCount: total - passed, - passedCount: passed, - byFailureStage: {}, - byFailureReason: {}, - }; + testId, + score, + executionStatus: score >= 0.8 ? 
'ok' : 'quality_failure', + } as EvaluationResult; } -describe('formatThresholdSummary', () => { - it('returns PASS when pass rate meets threshold', () => { - const result = formatThresholdSummary(makeSummary(9, 10), 0.6); - expect(result.passed).toBe(true); - expect(result.message).toContain('90.0%'); - expect(result.message).toContain('60.0%'); - expect(result.message).toContain('PASS'); - }); +describe('calculateEvaluationSummary with threshold', () => { + const results: EvaluationResult[] = [ + makeResult('test-1', 1.0), + makeResult('test-2', 0.6), + makeResult('test-3', 0.9), + makeResult('test-4', 0.4), + ]; - it('returns FAIL when pass rate is below threshold', () => { - const result = formatThresholdSummary(makeSummary(5, 10), 0.6); - expect(result.passed).toBe(false); - expect(result.message).toContain('50.0%'); - expect(result.message).toContain('60.0%'); - expect(result.message).toContain('FAIL'); + it('uses default 0.8 threshold when no threshold provided', () => { + const summary = calculateEvaluationSummary(results); + // test-1 (1.0) and test-3 (0.9) pass at 0.8 + expect(summary.passedCount).toBe(2); + expect(summary.qualityFailureCount).toBe(2); }); - it('returns PASS when pass rate exactly equals threshold', () => { - const result = formatThresholdSummary(makeSummary(6, 10), 0.6); - expect(result.passed).toBe(true); + it('recomputes passed/failed with custom threshold', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0.5 }); + // test-1 (1.0), test-2 (0.6), test-3 (0.9) pass at 0.5 + expect(summary.passedCount).toBe(3); + expect(summary.qualityFailureCount).toBe(1); }); - it('returns PASS for threshold 0 with any pass rate', () => { - const result = formatThresholdSummary(makeSummary(0, 10), 0); - expect(result.passed).toBe(true); + it('stricter threshold reduces pass count', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0.95 }); + // only test-1 (1.0) passes at 0.95 + 
expect(summary.passedCount).toBe(1); + expect(summary.qualityFailureCount).toBe(3); }); - it('excludes execution errors from pass rate calculation', () => { - const summary = makeSummary(8, 10); - // 2 execution errors, so graded = 10 - 2 = 8, pass rate = 8/8 = 100% - (summary as { executionErrorCount: number }).executionErrorCount = 2; - (summary as { qualityFailureCount: number }).qualityFailureCount = 0; - const result = formatThresholdSummary(summary, 1.0); - expect(result.passed).toBe(true); - expect(result.message).toContain('100.0%'); + it('threshold 0 passes everything', () => { + const summary = calculateEvaluationSummary(results, { threshold: 0 }); + expect(summary.passedCount).toBe(4); + expect(summary.qualityFailureCount).toBe(0); }); }); diff --git a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx index fda9dcf2..71ce71a8 100644 --- a/apps/web/src/content/docs/docs/evaluation/running-evals.mdx +++ b/apps/web/src/content/docs/docs/evaluation/running-evals.mdx @@ -242,7 +242,7 @@ When halted, remaining tests are recorded with `failureReasonCode: 'error_thresh ### Suite-Level Quality Threshold -Set a minimum pass rate for the eval suite. The **pass rate** is the fraction of test cases that score >= 0.8 (the per-case pass threshold). If the pass rate falls below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. +Set a per-test score threshold for the eval suite. Each test case must score at or above this value to pass. If any test scores below the threshold, the CLI exits with code 1 — useful for CI/CD quality gates. **CLI flag:** @@ -257,12 +257,12 @@ execution: threshold: 0.8 ``` -The CLI `--threshold` flag overrides the YAML value. The threshold is a number between 0 and 1. Pass rate is computed from quality results only (execution errors are excluded). +The CLI `--threshold` flag overrides the YAML value. 
The threshold is a number between 0 and 1 (default: 0.8). Execution errors are excluded from the count. -When active, a summary line is printed after the eval results: +When active, the summary line shows how many tests met the threshold: ``` -Suite pass rate: 90.3% (threshold: 80.0%) — PASS +RESULT: FAIL (28/31 scored >= 0.8, mean: 0.927) ``` The threshold also controls JUnit XML pass/fail: tests with scores below the threshold are marked as `` in JUnit output. When no threshold is set, JUnit defaults to 0.5. From cd69e5d210f13ed41696938af461ec0b13e9c712 Mon Sep 17 00:00:00 2001 From: Christopher Date: Tue, 31 Mar 2026 13:47:10 +0000 Subject: [PATCH 6/6] fix(core): thread --threshold through orchestrator for per-test scoring MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The threshold now flows from CLI → orchestrator → classifyQualityStatus, so the live progress line (e.g., "0.750 FAIL") and executionStatus in JSONL output both respect the custom threshold. Previously these were hardcoded to PASS_THRESHOLD (0.8) regardless of --threshold. Added threshold field to RunEvaluationOptions, RunEvalCaseOptions, and all intermediate call sites (runBatchEvaluation, evaluateCandidate). 
Co-Authored-By: Claude Opus 4.6 (1M context) --- apps/cli/src/commands/eval/run-eval.ts | 3 +++ packages/core/src/evaluation/orchestrator.ts | 22 ++++++++++++++++---- 2 files changed, 21 insertions(+), 4 deletions(-) diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index 6e26463b..ff53b8b5 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -567,6 +567,7 @@ async function runSingleEvalFile(params: { readonly matrixMode?: boolean; readonly totalBudgetUsd?: number; readonly failOnError?: FailOnError; + readonly threshold?: number; }): Promise<{ results: EvaluationResult[] }> { const { testFilePath, @@ -684,6 +685,7 @@ async function runSingleEvalFile(params: { failOnError, graderTarget: options.graderTarget, model: options.model, + threshold: options.threshold, streamCallbacks: streamingObserver?.getStreamCallbacks(), onResult: async (result: EvaluationResult) => { ( @@ -1161,6 +1163,7 @@ export async function runEvalCommand( matrixMode: targetPrep.selections.length > 1, totalBudgetUsd: targetPrep.totalBudgetUsd, failOnError: targetPrep.failOnError, + threshold: resolvedThreshold, }); return result.results; diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 27e1ce6f..521c5659 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -77,8 +77,8 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise = T | Promise; -function classifyQualityStatus(score: number): ExecutionStatus { - return score >= PASS_THRESHOLD ? 'ok' : 'quality_failure'; +function classifyQualityStatus(score: number, threshold = PASS_THRESHOLD): ExecutionStatus { + return score >= threshold ? 
'ok' : 'quality_failure'; } function buildSkippedEvaluatorError( @@ -194,6 +194,8 @@ export interface RunEvalCaseOptions { readonly evalDir?: string; /** Include verbose request details in results (e.g. agent input text) */ readonly verbose?: boolean; + /** Per-test score threshold for pass/fail (default: 0.8) */ + readonly threshold?: number; } export interface ProgressEvent { @@ -261,6 +263,8 @@ export interface RunEvaluationOptions { readonly graderTarget?: string; /** CLI override: model for grader target (e.g., "openai:gpt-5-mini") */ readonly model?: string; + /** Per-test score threshold for pass/fail (default: 0.8) */ + readonly threshold?: number; } export async function runEvaluation( @@ -299,6 +303,7 @@ export async function runEvaluation( retainOnFailure, graderTarget: cliGraderTarget, model: cliModel, + threshold: scoreThreshold, } = options; // Disable cache when trials > 1 (cache makes trials deterministic = pointless) @@ -475,6 +480,7 @@ export async function runEvaluation( agentTimeoutMs, targetResolver, availableTargets, + threshold: scoreThreshold, }); } catch (error) { if (verbose) { @@ -933,6 +939,7 @@ export async function runEvaluation( repoManager, evalDir, verbose, + threshold: scoreThreshold, }; let result = trials && trials.count > 1 @@ -1123,6 +1130,7 @@ async function runBatchEvaluation(options: { readonly agentTimeoutMs?: number; readonly targetResolver?: (name: string) => Provider | undefined; readonly availableTargets?: readonly string[]; + readonly threshold?: number; }): Promise { const { evalCases, @@ -1138,6 +1146,7 @@ async function runBatchEvaluation(options: { agentTimeoutMs, targetResolver, availableTargets, + threshold: batchThreshold, } = options; // Prepare prompt inputs up front so we can reuse them for grading. 
@@ -1246,6 +1255,7 @@ async function runBatchEvaluation(options: { targetResolver, availableTargets, verbose, + threshold: batchThreshold, }); if (providerError) { @@ -1337,6 +1347,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { const { evalCase, @@ -2041,6 +2054,7 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, + threshold: evalThreshold, } = options; const gradeTimestamp = nowFn(); @@ -2124,7 +2138,7 @@ async function evaluateCandidate(options: { scores: scores, trace: trace, fileChanges, - executionStatus: classifyQualityStatus(score.score), + executionStatus: classifyQualityStatus(score.score, evalThreshold), }; }