From 2324584df4b2e7da3f3201f41a2ec0bad053bdf8 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:23:31 +0000 Subject: [PATCH 1/9] feat: add ExecutionStatus, FailureStage types to EvaluationResult (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/types.ts | 35 +++++++++++++++++++++++++++ 1 file changed, 35 insertions(+) diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index fb500385c..657583818 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -782,6 +782,12 @@ export interface TrialResult { readonly scores?: readonly EvaluatorResult[]; readonly error?: string; readonly costUsd?: number; + /** Primary classification for this trial attempt */ + readonly executionStatus?: ExecutionStatus; + /** Pipeline stage where failure occurred */ + readonly failureStage?: FailureStage; + /** Machine-readable failure reason code */ + readonly failureReasonCode?: string; } /** @@ -819,6 +825,27 @@ export interface ConfidenceIntervalAggregation { */ export type TrialAggregation = PassAtKAggregation | MeanAggregation | ConfidenceIntervalAggregation; +/** + * Primary classification of evaluation outcome. + * - 'ok': evaluation completed, score reflects model quality (score >= 0.8) + * - 'quality_failure': evaluation completed but model scored below threshold + * - 'execution_error': evaluation could not complete due to infrastructure/tooling error + */ +export type ExecutionStatus = 'ok' | 'quality_failure' | 'execution_error'; + +/** + * Pipeline stage where the failure occurred. + */ +export type FailureStage = 'setup' | 'repo_setup' | 'agent' | 'evaluator' | 'teardown'; + +/** + * Structured error detail for execution failures. + */ +export interface ExecutionError { + readonly message: string; + readonly stage: FailureStage; +} + /** * Evaluator scorecard for a single eval case run. 
*/ @@ -876,6 +903,14 @@ export interface EvaluationResult { readonly costLimited?: boolean; /** Whether the evaluation was skipped due to suite-level budget exhaustion */ readonly budgetExceeded?: boolean; + /** Primary classification: ok, quality_failure, or execution_error */ + readonly executionStatus: ExecutionStatus; + /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */ + readonly failureStage?: FailureStage; + /** Machine-readable failure reason code (only when executionStatus !== 'ok') */ + readonly failureReasonCode?: string; + /** Structured error detail (only when executionStatus === 'execution_error') */ + readonly executionError?: ExecutionError; } export type EvaluationVerdict = 'pass' | 'fail' | 'borderline' | 'skip'; From 1e23df18c4c0b6fe493170217fa7410f5d1fa0fd Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:26:43 +0000 Subject: [PATCH 2/9] feat: classify execution errors at each orchestrator catch site (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/orchestrator.ts | 65 ++++++++++++++++++-- 1 file changed, 61 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index f6c9b86d8..f5a571cca 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -42,6 +42,8 @@ import type { EvaluatorConfig, EvaluatorKind, EvaluatorResult, + ExecutionStatus, + FailureStage, JsonObject, JsonValue, TrialResult, @@ -466,6 +468,13 @@ export async function runEvaluation( target: target.name, error: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, budgetExceeded: true, + executionStatus: 'execution_error', + failureStage: 'setup', + failureReasonCode: 'budget_exceeded', + executionError: { + message: `Suite budget exceeded ($${cumulativeBudgetCost.toFixed(4)} / $${totalBudgetUsd.toFixed(4)})`, + stage: 'setup', + }, 
}; if (onProgress) { @@ -599,6 +608,8 @@ export async function runEvaluation( outcome.reason, promptInputs, primaryProvider, + 'agent', + 'provider_error', ); results.push(errorResult); if (onResult) { @@ -788,7 +799,14 @@ async function runBatchEvaluation(options: { }); if (providerError) { - result = { ...result, error: providerError }; + result = { + ...result, + error: providerError, + executionStatus: 'execution_error' as const, + failureStage: 'agent' as const, + failureReasonCode: 'provider_error', + executionError: { message: providerError, stage: 'agent' as const }, + }; } } catch (error) { const errorResult = buildErrorResult( @@ -798,6 +816,8 @@ async function runBatchEvaluation(options: { error, promptInputs, provider, + 'evaluator', + 'evaluator_error', ); results.push(errorResult); if (onResult) { @@ -899,6 +919,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 + ? 'ok' + : 'quality_failure'; + const finalResult = providerError - ? { ...result, error: providerError, beforeAllOutput, beforeEachOutput, afterEachOutput } - : { ...result, beforeAllOutput, beforeEachOutput, afterEachOutput }; + ? { + ...result, + error: providerError, + executionStatus, + failureStage: 'agent' as const, + failureReasonCode: 'provider_error', + executionError: { message: providerError, stage: 'agent' as const }, + beforeAllOutput, + beforeEachOutput, + afterEachOutput, + } + : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput }; // Determine if this is a failure (has error or low score) const isFailure = !!finalResult.error || finalResult.score < 0.5; @@ -1205,6 +1253,8 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 ? 
'ok' as const : 'quality_failure' as const, }; } @@ -1861,7 +1912,9 @@ function buildErrorResult( timestamp: Date, error: unknown, promptInputs: PromptInputs, - provider?: Provider, + provider: Provider | undefined, + failureStage: FailureStage, + failureReasonCode: string, ): EvaluationResult { const message = error instanceof Error ? error.message : String(error); @@ -1913,6 +1966,10 @@ function buildErrorResult( requests, input, error: message, + executionStatus: 'execution_error', + failureStage, + failureReasonCode, + executionError: { message, stage: failureStage }, } satisfies EvaluationResult; } From 091a67d7c170c5cf82a5fbab13b97e3e7817b9a6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:30:00 +0000 Subject: [PATCH 3/9] feat: propagate executionStatus through trial aggregation (#431) Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/orchestrator.ts | 28 ++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index f5a571cca..dbcb64fea 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1307,6 +1307,9 @@ async function runEvalCaseWithTrials( scores: result.scores, error: result.error, costUsd: trialCost, + executionStatus: result.executionStatus, + failureStage: result.failureStage, + failureReasonCode: result.failureReasonCode, }; trialResults.push(trial); @@ -1343,12 +1346,37 @@ async function runEvalCaseWithTrials( ); const baseResult = allResults[bestTrialIndex]; + // Determine aggregate executionStatus from trial results: + // - If ANY trial succeeded → ok + // - If ALL trials had execution_error → execution_error + // - Otherwise → quality_failure + const hasOk = trialResults.some((t) => t.executionStatus === 'ok'); + const allExecutionError = + trialResults.length > 0 && trialResults.every((t) => t.executionStatus === 'execution_error'); + const 
aggregateExecutionStatus: ExecutionStatus = hasOk + ? 'ok' + : allExecutionError + ? 'execution_error' + : 'quality_failure'; + + // When the aggregate status differs from baseResult, clear failure fields that no longer apply + const aggregateFailureStage = + aggregateExecutionStatus === 'ok' ? undefined : baseResult.failureStage; + const aggregateFailureReasonCode = + aggregateExecutionStatus === 'ok' ? undefined : baseResult.failureReasonCode; + const aggregateExecutionError = + aggregateExecutionStatus === 'execution_error' ? baseResult.executionError : undefined; + return { ...baseResult, score, trials: trialResults, aggregation, costLimited: costLimited || undefined, + executionStatus: aggregateExecutionStatus, + failureStage: aggregateFailureStage, + failureReasonCode: aggregateFailureReasonCode, + executionError: aggregateExecutionError, }; } From 1e23b27cb560656260887ebbaa949625de5187b7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:31:17 +0000 Subject: [PATCH 4/9] feat: separate execution errors from quality metrics in summary (#431) Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 96 ++++++++++++++++++++---- 1 file changed, 81 insertions(+), 15 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index c830592fa..6ed2b1568 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -17,6 +17,11 @@ export interface EvaluationSummary { readonly bottomResults: readonly EvaluationResult[]; readonly errorCount: number; readonly errors: readonly { readonly testId: string; readonly error: string }[]; + readonly executionErrorCount: number; + readonly qualityFailureCount: number; + readonly passedCount: number; + readonly byFailureStage: Readonly>; + readonly byFailureReason: Readonly>; } const HISTOGRAM_BREAKPOINTS = [0, 0.2, 0.4, 0.6, 0.8, 1]; @@ -80,7 +85,6 @@ function buildHistogram(values: readonly 
number[]): readonly HistogramBin[] { export function calculateEvaluationSummary( results: readonly EvaluationResult[], ): EvaluationSummary { - const scores = results.map((result) => result.score); const total = results.length; // Track errors @@ -102,20 +106,52 @@ export function calculateEvaluationSummary( bottomResults: [], errorCount: 0, errors: [], + executionErrorCount: 0, + qualityFailureCount: 0, + passedCount: 0, + byFailureStage: {}, + byFailureReason: {}, }; } - const mean = computeMean(scores); - const median = computeMedian(scores); - const min = Math.min(...scores); - const max = Math.max(...scores); - const standardDeviation = computeStandardDeviation(scores); - const histogram = buildHistogram(scores); - - const sortedResults = [...results].sort((a, b) => b.score - a.score); + // Separate execution errors from quality results + const executionErrors = results.filter((r) => r.executionStatus === 'execution_error'); + const qualityResults = results.filter((r) => r.executionStatus !== 'execution_error'); + const qualityScores = qualityResults.map((r) => r.score); + + // Compute quality metrics from non-execution-error results only + const mean = computeMean(qualityScores); + const median = computeMedian(qualityScores); + const min = qualityScores.length > 0 ? Math.min(...qualityScores) : 0; + const max = qualityScores.length > 0 ? 
Math.max(...qualityScores) : 0; + const standardDeviation = computeStandardDeviation(qualityScores); + const histogram = buildHistogram(qualityScores); + + // Top/bottom results consider only non-execution-error results + const sortedResults = [...qualityResults].sort((a, b) => b.score - a.score); const topResults = sortedResults.slice(0, Math.min(3, sortedResults.length)); const bottomResults = sortedResults.slice(-Math.min(3, sortedResults.length)); + // Count by execution status + const executionErrorCount = executionErrors.length; + const qualityFailureCount = results.filter( + (r) => r.executionStatus === 'quality_failure', + ).length; + const passedCount = results.filter((r) => r.executionStatus === 'ok').length; + + // Aggregate by failure stage and reason + const byFailureStage: Record = {}; + const byFailureReason: Record = {}; + for (const result of results) { + if (result.failureStage) { + byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1; + } + if (result.failureReasonCode) { + byFailureReason[result.failureReasonCode] = + (byFailureReason[result.failureReasonCode] ?? 
0) + 1; + } + } + return { total, mean, @@ -128,6 +164,11 @@ export function calculateEvaluationSummary( bottomResults, errorCount, errors, + executionErrorCount, + qualityFailureCount, + passedCount, + byFailureStage, + byFailureReason, }; } @@ -145,7 +186,7 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { // Display errors first if any exist if (summary.errorCount > 0) { lines.push('\n=================================================='); - lines.push('ERRORS'); + lines.push('EXECUTION ERRORS'); lines.push('=================================================='); for (const error of summary.errors) { lines.push(`\n❌ ${error.testId}`); @@ -158,13 +199,22 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { lines.push('EVALUATION SUMMARY'); lines.push('=================================================='); lines.push(`Total tests: ${summary.total}`); - - if (summary.errorCount > 0) { - lines.push(`Failed: ${summary.errorCount}`); - lines.push(`Passed: ${summary.total - summary.errorCount}`); + lines.push(`Passed: ${summary.passedCount}`); + if (summary.qualityFailureCount > 0) { + lines.push(`Quality failures: ${summary.qualityFailureCount}`); + } + if (summary.executionErrorCount > 0) { + lines.push(`Execution errors: ${summary.executionErrorCount}`); } - lines.push(`Mean score: ${formatScore(summary.mean)}`); + if (summary.executionErrorCount > 0) { + const qualityCount = summary.total - summary.executionErrorCount; + lines.push( + `Mean score: ${formatScore(summary.mean)} (${qualityCount} quality tests, ${summary.executionErrorCount} execution errors excluded)`, + ); + } else { + lines.push(`Mean score: ${formatScore(summary.mean)}`); + } lines.push(`Median score: ${formatScore(summary.median)}`); lines.push(`Min score: ${formatScore(summary.min)}`); lines.push(`Max score: ${formatScore(summary.max)}`); @@ -188,6 +238,22 @@ export function formatEvaluationSummary(summary: EvaluationSummary): string { 
lines.push(` ${index + 1}. ${result.testId}: ${formatScore(result.score)}`); }); + const failureStageEntries = Object.entries(summary.byFailureStage); + if (failureStageEntries.length > 0) { + lines.push('\nExecution errors by stage:'); + for (const [stage, count] of failureStageEntries) { + lines.push(` ${stage}: ${count}`); + } + } + + const failureReasonEntries = Object.entries(summary.byFailureReason); + if (failureReasonEntries.length > 0) { + lines.push('\nExecution errors by reason:'); + for (const [reason, count] of failureReasonEntries) { + lines.push(` ${reason}: ${count}`); + } + } + return lines.join('\n'); } From 12a9f4d07c5294b7d628f22c15ab71601bac49d7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:34:45 +0000 Subject: [PATCH 5/9] test: add executionStatus assertions to existing orchestrator tests (#431) Add executionStatus, failureStage, failureReasonCode, and executionError assertions to representative existing tests: - Success path: assert executionStatus === 'ok' - Provider throw: assert execution_error with agent stage - Provider raw.error: assert execution_error with provider_error code - Setup script failure: assert execution_error with setup stage - Successful workspace scripts: assert ok status Co-Authored-By: Claude Opus 4.6 --- .../core/test/evaluation/orchestrator.test.ts | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/packages/core/test/evaluation/orchestrator.test.ts b/packages/core/test/evaluation/orchestrator.test.ts index 6262e779a..6be7439e7 100644 --- a/packages/core/test/evaluation/orchestrator.test.ts +++ b/packages/core/test/evaluation/orchestrator.test.ts @@ -177,6 +177,9 @@ describe('runTestCase', () => { expect(result.misses).toHaveLength(0); expect(result.timestamp).toBe('2024-01-01T00:00:00.000Z'); expect(result.input).toBe('Explain logging improvements'); + expect(result.executionStatus).toBe('ok'); + expect(result.failureStage).toBeUndefined(); + 
expect(result.failureReasonCode).toBeUndefined(); }); it('reuses cached provider response when available', async () => { @@ -258,6 +261,11 @@ describe('runTestCase', () => { expect(result.score).toBe(0); expect(result.misses[0]).toContain('Provider failure'); expect(result.input).toBe('Explain logging improvements'); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + expect(result.failureReasonCode).toBe('provider_error'); + expect(result.executionError).toBeDefined(); + expect(result.executionError?.message).toContain('Provider failure'); }); it('surfaces provider raw.error as evaluation error', async () => { @@ -278,6 +286,9 @@ describe('runTestCase', () => { }); expect(result.error).toBe("Batch output missing id 'case-1'"); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + expect(result.failureReasonCode).toBe('provider_error'); }); it('reports failed progress status for batch item errors', async () => { @@ -1423,6 +1434,7 @@ rl.on('close', () => { expect(result.beforeAllOutput).toContain('Setup done for case-1'); expect(result.error).toBeUndefined(); + expect(result.executionStatus).toBe('ok'); }); it('returns error result when setup script fails', async () => { @@ -1463,6 +1475,10 @@ rl.on('close', () => { expect(result.error).toContain('before_all script failed'); expect(result.score).toBe(0); + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('setup'); + expect(result.failureReasonCode).toBe('script_error'); + expect(result.executionError).toBeDefined(); }); it('executes teardown script and captures output in result', async () => { @@ -1520,6 +1536,7 @@ rl.on('close', () => { expect(result.afterEachOutput).toContain('Teardown done for case-1'); expect(result.error).toBeUndefined(); + expect(result.executionStatus).toBe('ok'); }); }); From c3b6e7040f4fec6322d84ca993cd2ea5a8d31d47 Mon Sep 17 00:00:00 2001 From: 
Christopher Tso Date: Fri, 6 Mar 2026 00:35:55 +0000 Subject: [PATCH 6/9] test: add dedicated execution status classification tests (#431) New test file covering all executionStatus classification paths: - Provider throw -> execution_error with agent stage - High score (>=0.8) -> ok - Low score (<0.8) -> quality_failure - Backward compatibility: error field still set alongside executionError - Threshold boundary tests at exactly 0.8 and 0.79 Co-Authored-By: Claude Opus 4.6 --- .../test/evaluation/execution-status.test.ts | 232 ++++++++++++++++++ 1 file changed, 232 insertions(+) create mode 100644 packages/core/test/evaluation/execution-status.test.ts diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts new file mode 100644 index 000000000..21748c162 --- /dev/null +++ b/packages/core/test/evaluation/execution-status.test.ts @@ -0,0 +1,232 @@ +import { describe, expect, it } from 'bun:test'; + +import { runEvalCase } from '../../src/evaluation/orchestrator.js'; +import type { ResolvedTarget } from '../../src/evaluation/providers/targets.js'; +import type { + Provider, + ProviderResponse, +} from '../../src/evaluation/providers/types.js'; +import type { EvalTest } from '../../src/evaluation/types.js'; + +// --------------------------------------------------------------------------- +// Mock providers +// --------------------------------------------------------------------------- + +class ErrorProvider implements Provider { + readonly id = 'mock:error'; + readonly kind = 'mock' as const; + readonly targetName = 'error-target'; + + async invoke(): Promise { + throw new Error('Provider failed'); + } +} + +class FixedResponseProvider implements Provider { + readonly id = 'mock:fixed'; + readonly kind = 'mock' as const; + readonly targetName = 'fixed-target'; + + constructor(private readonly response: string) {} + + async invoke(): Promise { + return { + output: [{ role: 'assistant', content: 
this.response }], + }; + } +} + +// --------------------------------------------------------------------------- +// Shared fixtures +// --------------------------------------------------------------------------- + +const baseTestCase: EvalTest = { + id: 'exec-status-1', + dataset: 'test-dataset', + question: 'Explain logging improvements', + input: [{ role: 'user', content: 'Explain logging improvements' }], + input_segments: [{ type: 'text', value: 'Explain logging improvements' }], + expected_output: [], + reference_answer: '- add structured logging\n- avoid global state', + guideline_paths: [], + file_paths: [], + criteria: 'Logging improved', + evaluator: 'llm_judge', +}; + +const baseTarget: ResolvedTarget = { + kind: 'mock', + name: 'mock', + config: { response: '{}' }, +}; + +/** Returns a score >= 0.8 → executionStatus 'ok' */ +const highScoreEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.9, + verdict: 'pass' as const, + hits: ['good answer'], + misses: [], + expectedAspectCount: 1, + }; + }, + }, +}; + +/** Returns a score < 0.8 → executionStatus 'quality_failure' */ +const lowScoreEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.3, + verdict: 'fail' as const, + hits: [], + misses: ['missed the point'], + expectedAspectCount: 1, + }; + }, + }, +}; + +// --------------------------------------------------------------------------- +// Tests +// --------------------------------------------------------------------------- + +describe('execution status classification', () => { + it('classifies provider errors as execution_error with agent stage', async () => { + const provider = new ErrorProvider(); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + expect(result.executionStatus).toBe('execution_error'); + expect(result.failureStage).toBe('agent'); + 
expect(result.failureReasonCode).toBe('provider_error'); + expect(result.executionError).toBeDefined(); + expect(result.executionError?.message).toContain('Provider failed'); + expect(result.executionError?.stage).toBe('agent'); + // Backward compat: error field still set + expect(result.error).toBeDefined(); + expect(result.score).toBe(0); + }); + + it('classifies high-scoring results as ok', async () => { + const provider = new FixedResponseProvider( + 'Add structured logging and avoid global state.', + ); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + expect(result.executionStatus).toBe('ok'); + expect(result.failureStage).toBeUndefined(); + expect(result.failureReasonCode).toBeUndefined(); + expect(result.executionError).toBeUndefined(); + expect(result.score).toBeGreaterThanOrEqual(0.8); + }); + + it('classifies low-scoring results as quality_failure', async () => { + const provider = new FixedResponseProvider('I have no idea about logging.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: lowScoreEvaluators, + }); + + expect(result.executionStatus).toBe('quality_failure'); + expect(result.failureStage).toBeUndefined(); + expect(result.failureReasonCode).toBeUndefined(); + expect(result.executionError).toBeUndefined(); + expect(result.score).toBeLessThan(0.8); + }); + + it('preserves backward-compatible error field on execution errors', async () => { + const provider = new ErrorProvider(); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: highScoreEvaluators, + }); + + // Both old and new fields are set + expect(result.error).toBeDefined(); + expect(typeof result.error).toBe('string'); + expect(result.executionStatus).toBe('execution_error'); + expect(result.executionError).toBeDefined(); + 
expect(result.executionError?.message).toBe(result.error); + }); + + it('sets executionStatus to ok at exact 0.8 threshold', async () => { + const thresholdEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.8, + verdict: 'pass' as const, + hits: ['acceptable'], + misses: [], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new FixedResponseProvider('Adequate answer.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: thresholdEvaluators, + }); + + expect(result.executionStatus).toBe('ok'); + expect(result.score).toBe(0.8); + }); + + it('sets executionStatus to quality_failure just below threshold', async () => { + const belowThresholdEvaluators = { + llm_judge: { + kind: 'llm_judge', + async evaluate() { + return { + score: 0.79, + verdict: 'fail' as const, + hits: [], + misses: ['barely missed'], + expectedAspectCount: 1, + }; + }, + }, + }; + + const provider = new FixedResponseProvider('Almost adequate answer.'); + + const result = await runEvalCase({ + evalCase: baseTestCase, + provider, + target: baseTarget, + evaluators: belowThresholdEvaluators, + }); + + expect(result.executionStatus).toBe('quality_failure'); + expect(result.score).toBe(0.79); + }); +}); From b56dc9cafd559f5c3ad28c5a28db01211d6fcf19 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 00:38:21 +0000 Subject: [PATCH 7/9] fix: add executionStatus to baseline test fixture (#431) Add required executionStatus: 'ok' to makeFullResult() in baseline.test.ts to match the updated EvaluationResult type which now requires this field. 
Co-Authored-By: Claude Opus 4.6 --- ...026-02-26-eval-schema-generation-design.md | 652 ++++++++++++++++++ .../core/test/evaluation/baseline.test.ts | 1 + 2 files changed, 653 insertions(+) create mode 100644 docs/plans/2026-02-26-eval-schema-generation-design.md diff --git a/docs/plans/2026-02-26-eval-schema-generation-design.md b/docs/plans/2026-02-26-eval-schema-generation-design.md new file mode 100644 index 000000000..c1edcc646 --- /dev/null +++ b/docs/plans/2026-02-26-eval-schema-generation-design.md @@ -0,0 +1,652 @@ +# Eval Schema Generation Implementation Plan + +> **For Claude:** REQUIRED SUB-SKILL: Use superpowers:executing-plans to implement this plan task-by-task. + +**Goal:** Auto-generate `eval-schema.json` from a Zod schema and add a diff test to catch drift. + +**Architecture:** Create a comprehensive Zod schema (`eval-file.schema.ts`) that mirrors the eval YAML file structure. A generator script converts it to JSON Schema via `zod-to-json-schema`. A test regenerates and diffs against the committed file — if they diverge, it fails. + +**Tech Stack:** Zod, zod-to-json-schema, Vitest + +--- + +### Task 1: Add `zod-to-json-schema` dependency + +**Files:** +- Modify: `packages/core/package.json` + +**Step 1: Install the dependency** + +Run: `cd /home/christso/projects/agentv && bun add -d zod-to-json-schema --cwd packages/core` + +**Step 2: Verify installation** + +Run: `grep zod-to-json-schema packages/core/package.json` +Expected: `"zod-to-json-schema": "^3.x.x"` in devDependencies + +**Step 3: Commit** + +```bash +git add packages/core/package.json bun.lock +git commit -m "chore: add zod-to-json-schema dev dependency" +``` + +--- + +### Task 2: Create the eval file Zod schema + +**Files:** +- Create: `packages/core/src/evaluation/validation/eval-file.schema.ts` + +**Context:** This schema represents the **YAML input format** (what users write), not the parsed runtime types. 
Key differences from runtime types: +- Uses snake_case field names (YAML convention) +- Includes shorthands (string input → message array) +- Includes deprecated aliases (eval_cases, script, expected_outcome) +- Uses `additionalProperties` / `.passthrough()` where custom config is allowed +- Does NOT include resolved/computed fields (resolvedCwd, resolvedPromptPath, etc.) + +The schema should import `EVALUATOR_KIND_VALUES` from `types.ts` to stay in sync with the evaluator kind enum. + +**Step 1: Write the schema file** + +Create `packages/core/src/evaluation/validation/eval-file.schema.ts` with: + +```typescript +/** + * Zod schema for eval YAML file format. + * Used to generate eval-schema.json for AI agent reference. + * + * IMPORTANT: This schema describes the YAML input format, not the parsed runtime types. + * When adding new eval features, update this schema AND run `bun run generate:schema` + * to regenerate eval-schema.json. The sync test will fail if they diverge. + */ +import { z } from 'zod'; + +// --------------------------------------------------------------------------- +// Shared primitives +// --------------------------------------------------------------------------- + +/** Message content: string or structured array */ +const ContentItemSchema = z.object({ + type: z.enum(['text', 'file']), + value: z.string(), +}); + +const MessageContentSchema = z.union([ + z.string(), + z.array(ContentItemSchema), +]); + +const MessageSchema = z.object({ + role: z.enum(['system', 'user', 'assistant', 'tool']), + content: MessageContentSchema, +}); + +/** Input: string shorthand or message array */ +const InputSchema = z.union([z.string(), z.array(MessageSchema)]); + +/** Expected output: string, object, or message array */ +const ExpectedOutputSchema = z.union([ + z.string(), + z.record(z.unknown()), + z.array(MessageSchema), +]); + +// --------------------------------------------------------------------------- +// Evaluator schemas (YAML input format) +// 
--------------------------------------------------------------------------- + +/** Common fields shared by all evaluators */ +const EvaluatorCommonSchema = z.object({ + name: z.string().optional(), + weight: z.number().min(0).optional(), + required: z.union([z.boolean(), z.number().gt(0).lte(1)]).optional(), + negate: z.boolean().optional(), +}); + +/** Prompt: string (inline/file path) or executable script config */ +const PromptSchema = z.union([ + z.string(), + z.object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + script: z.union([z.string(), z.array(z.string())]).optional(), + config: z.record(z.unknown()).optional(), + }), +]); + +/** Score range for analytic rubrics */ +const ScoreRangeSchema = z.object({ + score_range: z.tuple([z.number().int().min(0).max(10), z.number().int().min(0).max(10)]), + outcome: z.string().min(1), +}); + +/** Rubric item (checklist or score-range mode) */ +const RubricItemSchema = z.object({ + id: z.string().optional(), + outcome: z.string().optional(), + weight: z.number().optional(), + required: z.boolean().optional(), + required_min_score: z.number().int().min(0).max(10).optional(), + score_ranges: z.array(ScoreRangeSchema).optional(), +}); + +// --- Type-specific evaluator schemas --- + +const CodeJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('code_judge'), + command: z.union([z.string(), z.array(z.string())]), + script: z.union([z.string(), z.array(z.string())]).optional(), + cwd: z.string().optional(), + target: z.union([z.boolean(), z.object({ max_calls: z.number().optional() })]).optional(), + config: z.record(z.unknown()).optional(), +}); + +const LlmJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('llm_judge'), + prompt: PromptSchema.optional(), + rubrics: z.array(RubricItemSchema).optional(), + model: z.string().optional(), + config: z.record(z.unknown()).optional(), +}); + +/** Aggregator configs for composite evaluator */ +const AggregatorSchema = 
z.discriminatedUnion('type', [ + z.object({ + type: z.literal('weighted_average'), + weights: z.record(z.number()).optional(), + }), + z.object({ + type: z.literal('threshold'), + threshold: z.number().min(0).max(1), + }), + z.object({ + type: z.literal('code_judge'), + path: z.string(), + cwd: z.string().optional(), + }), + z.object({ + type: z.literal('llm_judge'), + prompt: z.string().optional(), + model: z.string().optional(), + }), +]); + +// Use z.lazy for recursive composite evaluator +const CompositeSchema: z.ZodType = z.lazy(() => + EvaluatorCommonSchema.extend({ + type: z.literal('composite'), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + aggregator: AggregatorSchema, + }), +); + +const ArgsMatchSchema = z.union([ + z.enum(['exact', 'ignore', 'subset', 'superset']), + z.array(z.string()), +]); + +const ToolTrajectoryExpectedItemSchema = z.object({ + tool: z.string(), + args: z.union([z.literal('any'), z.record(z.unknown())]).optional(), + max_duration_ms: z.number().min(0).optional(), + maxDurationMs: z.number().min(0).optional(), + args_match: ArgsMatchSchema.optional(), + argsMatch: ArgsMatchSchema.optional(), +}); + +const ToolTrajectorySchema = EvaluatorCommonSchema.extend({ + type: z.literal('tool_trajectory'), + mode: z.enum(['any_order', 'in_order', 'exact', 'subset', 'superset']), + minimums: z.record(z.number().int().min(0)).optional(), + expected: z.array(ToolTrajectoryExpectedItemSchema).optional(), + args_match: ArgsMatchSchema.optional(), + argsMatch: ArgsMatchSchema.optional(), +}); + +const FieldConfigSchema = z.object({ + path: z.string(), + match: z.enum(['exact', 'numeric_tolerance', 'date']), + required: z.boolean().optional(), + weight: z.number().optional(), + tolerance: z.number().min(0).optional(), + relative: z.boolean().optional(), + formats: z.array(z.string()).optional(), +}); + +const FieldAccuracySchema = EvaluatorCommonSchema.extend({ + type: z.literal('field_accuracy'), + 
fields: z.array(FieldConfigSchema).min(1), + aggregation: z.enum(['weighted_average', 'all_or_nothing']).optional(), +}); + +const LatencySchema = EvaluatorCommonSchema.extend({ + type: z.literal('latency'), + threshold: z.number().min(0), +}); + +const CostSchema = EvaluatorCommonSchema.extend({ + type: z.literal('cost'), + budget: z.number().min(0), +}); + +const TokenUsageSchema = EvaluatorCommonSchema.extend({ + type: z.literal('token_usage'), + max_total: z.number().min(0).optional(), + max_input: z.number().min(0).optional(), + max_output: z.number().min(0).optional(), +}); + +const ExecutionMetricsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('execution_metrics'), + max_tool_calls: z.number().min(0).optional(), + max_llm_calls: z.number().min(0).optional(), + max_tokens: z.number().min(0).optional(), + max_cost_usd: z.number().min(0).optional(), + max_duration_ms: z.number().min(0).optional(), + target_exploration_ratio: z.number().min(0).max(1).optional(), + exploration_tolerance: z.number().min(0).optional(), +}); + +const AgentJudgeSchema = EvaluatorCommonSchema.extend({ + type: z.literal('agent_judge'), + prompt: z.string().optional(), + rubrics: z.array(RubricItemSchema).optional(), + max_steps: z.number().int().min(1).max(50).optional(), + temperature: z.number().min(0).max(2).optional(), + target: z.string().optional(), +}); + +const ContainsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('contains'), + value: z.string(), +}); + +const RegexSchema = EvaluatorCommonSchema.extend({ + type: z.literal('regex'), + value: z.string(), +}); + +const IsJsonSchema = EvaluatorCommonSchema.extend({ + type: z.literal('is_json'), +}); + +const EqualsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('equals'), + value: z.string(), +}); + +const RubricsSchema = EvaluatorCommonSchema.extend({ + type: z.literal('rubrics'), + criteria: z.array(RubricItemSchema).min(1), +}); + +/** Union of all evaluator types */ +const EvaluatorSchema = 
z.union([ + CodeJudgeSchema, + LlmJudgeSchema, + CompositeSchema, + ToolTrajectorySchema, + FieldAccuracySchema, + LatencySchema, + CostSchema, + TokenUsageSchema, + ExecutionMetricsSchema, + AgentJudgeSchema, + ContainsSchema, + RegexSchema, + IsJsonSchema, + EqualsSchema, + RubricsSchema, +]); + +// --------------------------------------------------------------------------- +// Workspace +// --------------------------------------------------------------------------- + +const WorkspaceScriptSchema = z.object({ + command: z.union([z.string(), z.array(z.string())]).optional(), + script: z.union([z.string(), z.array(z.string())]).optional(), + timeout_ms: z.number().min(0).optional(), + cwd: z.string().optional(), +}); + +const WorkspaceSchema = z.object({ + template: z.string().optional(), + before_all: WorkspaceScriptSchema.optional(), + after_all: WorkspaceScriptSchema.optional(), + before_each: WorkspaceScriptSchema.optional(), + after_each: WorkspaceScriptSchema.optional(), +}); + +// --------------------------------------------------------------------------- +// Execution block +// --------------------------------------------------------------------------- + +const TrialsSchema = z.object({ + count: z.number().int().min(1), + strategy: z.enum(['pass_at_k', 'mean', 'confidence_interval']).optional(), + cost_limit_usd: z.number().min(0).optional(), + costLimitUsd: z.number().min(0).optional(), +}); + +const ExecutionSchema = z.object({ + target: z.string().optional(), + targets: z.array(z.string()).optional(), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + skip_defaults: z.boolean().optional(), + cache: z.boolean().optional(), + trials: TrialsSchema.optional(), + total_budget_usd: z.number().min(0).optional(), + totalBudgetUsd: z.number().min(0).optional(), +}); + +// --------------------------------------------------------------------------- +// Test case +// 
--------------------------------------------------------------------------- + +const EvalTestSchema = z.object({ + id: z.string().min(1), + criteria: z.string().optional(), + expected_outcome: z.string().optional(), + input: InputSchema.optional(), + expected_output: ExpectedOutputSchema.optional(), + assert: z.array(EvaluatorSchema).optional(), + evaluators: z.array(EvaluatorSchema).optional(), + execution: ExecutionSchema.optional(), + workspace: WorkspaceSchema.optional(), + metadata: z.record(z.unknown()).optional(), + conversation_id: z.string().optional(), + dataset: z.string().optional(), + note: z.string().optional(), +}); + +// --------------------------------------------------------------------------- +// Top-level eval file +// --------------------------------------------------------------------------- + +export const EvalFileSchema = z.object({ + $schema: z.string().optional(), + // Metadata + name: z.string().regex(/^[a-z0-9-]+$/).optional(), + description: z.string().optional(), + version: z.string().optional(), + author: z.string().optional(), + tags: z.array(z.string()).optional(), + license: z.string().optional(), + requires: z.object({ agentv: z.string().optional() }).optional(), + // Suite-level input + input: InputSchema.optional(), + // Tests (array or external file path) + tests: z.union([z.array(EvalTestSchema), z.string()]), + // Deprecated aliases + eval_cases: z.union([z.array(EvalTestSchema), z.string()]).optional(), + // Target + target: z.string().optional(), + // Execution + execution: ExecutionSchema.optional(), + // Suite-level assertions + assert: z.array(EvaluatorSchema).optional(), + // Workspace + workspace: WorkspaceSchema.optional(), +}); +``` + +**Step 2: Verify the file compiles** + +Run: `cd /home/christso/projects/agentv && bunx tsc --noEmit packages/core/src/evaluation/validation/eval-file.schema.ts --esModuleInterop --moduleResolution bundler --module esnext --target es2022 --strict` + +If tsc is fussy with standalone 
file checking, just run the full typecheck: +Run: `bun run typecheck --filter @agentv/core` + +**Step 3: Commit** + +```bash +git add packages/core/src/evaluation/validation/eval-file.schema.ts +git commit -m "feat: add Zod schema for eval YAML file format" +``` + +--- + +### Task 3: Create the generator script + +**Files:** +- Create: `packages/core/scripts/generate-eval-schema.ts` +- Modify: `packages/core/package.json` (add script) + +**Step 1: Write the generator script** + +Create `packages/core/scripts/generate-eval-schema.ts`: + +```typescript +#!/usr/bin/env bun +/** + * Generates eval-schema.json from the Zod schema. + * Run: bun run generate:schema (from packages/core) + * Or: bun packages/core/scripts/generate-eval-schema.ts (from repo root) + */ +import { zodToJsonSchema } from 'zod-to-json-schema'; +import { writeFile } from 'node:fs/promises'; +import path from 'node:path'; +import { EvalFileSchema } from '../src/evaluation/validation/eval-file.schema.js'; + +const jsonSchema = zodToJsonSchema(EvalFileSchema, { + name: 'EvalFile', + $refStrategy: 'none', +}); + +// Add JSON Schema metadata +const schema = { + $schema: 'http://json-schema.org/draft-07/schema#', + title: 'AgentV Eval File', + description: 'Schema for AgentV evaluation YAML files (.eval.yaml)', + ...jsonSchema, +}; + +const outputPath = path.resolve( + import.meta.dirname, + '../../../plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json', +); + +await writeFile(outputPath, `${JSON.stringify(schema, null, 2)}\n`); +console.log(`Generated: ${outputPath}`); +``` + +**Step 2: Add the script to package.json** + +Add to `packages/core/package.json` scripts: +```json +"generate:schema": "bun scripts/generate-eval-schema.ts" +``` + +**Step 3: Run the generator and verify output** + +Run: `cd /home/christso/projects/agentv/packages/core && bun run generate:schema` +Expected: `Generated: .../plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json` + +Inspect 
**Step 1: Write the sync test (it should pass immediately, since the schema was just generated in Task 3)**
YAML files (.eval.yaml)', + ...generated, + }; + + // Compare (ignoring formatting differences) + expect(JSON.parse(JSON.stringify(committed))).toEqual( + JSON.parse(JSON.stringify(expected)), + ); + }); +}); +``` + +**Step 2: Run the test to verify it passes** + +Run: `cd /home/christso/projects/agentv && bun test packages/core/test/evaluation/validation/eval-schema-sync.test.ts` +Expected: PASS (since we just generated the schema in Task 3) + +**Step 3: Commit** + +```bash +git add packages/core/test/evaluation/validation/eval-schema-sync.test.ts +git commit -m "test: add eval-schema.json sync test" +``` + +--- + +### Task 5: Also copy generated schema to CLI dist templates + +**Context:** The schema is also bundled in `apps/cli/dist/templates/`. Check if this is done by the build or needs manual sync. + +**Step 1: Check how CLI templates reference the schema** + +Run: `diff plugins/agentv-dev/skills/agentv-eval-builder/references/eval-schema.json apps/cli/dist/templates/.claude/skills/agentv-eval-builder/references/eval-schema.json` + +If they differ, the CLI build should copy from the source. Check the CLI build process: +Run: `grep -r "eval-schema" apps/cli/tsup.config.ts apps/cli/package.json 2>/dev/null` + +If no copy step exists, the template copies are stale artifacts. Either: +- Add a copy step to the CLI build, or +- Note this as out of scope (the CLI templates are created by `agentv create` and may have their own update cycle) + +**Step 2: Determine action and commit if needed** + +This step is investigative — commit only if a change is needed. 
+ +--- + +### Task 6: Run full test suite and push + +**Step 1: Run all tests** + +Run: `cd /home/christso/projects/agentv && bun run test` +Expected: All tests pass + +**Step 2: Run typecheck** + +Run: `cd /home/christso/projects/agentv && bun run typecheck` +Expected: No errors + +**Step 3: Run lint** + +Run: `cd /home/christso/projects/agentv && bun run lint` +Expected: No errors (fix any formatting issues from generated file) + +**Step 4: Push the branch** + +Run: `git push -u origin chore/update-eval-schema` + +--- + +### Task 7: Create PR and file follow-up issue + +**Step 1: Create PR** + +```bash +gh pr create --title "chore: auto-generate eval-schema.json from Zod" --body "$(cat <<'EOF' +## Summary +- Adds a comprehensive Zod schema (`eval-file.schema.ts`) that describes the eval YAML file format +- Generates `eval-schema.json` from this Zod schema via `zod-to-json-schema` +- Adds a sync test that regenerates and diffs — fails if schema drifts from Zod + +## Motivation +The JSON schema was manually maintained and had drifted significantly from the actual validation logic. This ensures the schema stays current as the codebase evolves. + +## How to update the schema +When adding new eval features, update `eval-file.schema.ts` and run: +```bash +cd packages/core && bun run generate:schema +``` + +## Test plan +- [ ] `bun test packages/core/test/evaluation/validation/eval-schema-sync.test.ts` passes +- [ ] Full test suite passes +- [ ] Schema validates against existing example eval files + +🤖 Generated with [Claude Code](https://claude.com/claude-code) +EOF +)" +``` + +**Step 2: File follow-up issue for Approach B** + +```bash +gh issue create --title "refactor: migrate eval-validator.ts from procedural to Zod-based validation" --body "$(cat <<'EOF' +## Context +The eval file validation in `eval-validator.ts` uses procedural if/else logic (~500+ lines). 
A parallel Zod schema (`eval-file.schema.ts`) was added in #431 for JSON Schema generation, creating two sources of truth.
to prevent threshold drift - Scope byFailureStage/byFailureReason aggregation to execution_error results only (was iterating all results) Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 4 ++-- packages/core/src/evaluation/orchestrator.ts | 13 +++++++++---- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 6ed2b1568..4ace301de 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -139,10 +139,10 @@ export function calculateEvaluationSummary( ).length; const passedCount = results.filter((r) => r.executionStatus === 'ok').length; - // Aggregate by failure stage and reason + // Aggregate by failure stage and reason (execution errors only) const byFailureStage: Record = {}; const byFailureReason: Record = {}; - for (const result of results) { + for (const result of executionErrors) { if (result.failureStage) { byFailureStage[result.failureStage] = (byFailureStage[result.failureStage] ?? 0) + 1; } diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index dbcb64fea..9401b13db 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -69,6 +69,13 @@ import { type PromptInputs, buildPromptInputs, loadTests } from './yaml-parser.j type MaybePromise = T | Promise; +/** Threshold for classifying ok vs quality_failure (score >= threshold → ok). */ +const QUALITY_PASS_THRESHOLD = 0.8; + +function classifyQualityStatus(score: number): ExecutionStatus { + return score >= QUALITY_PASS_THRESHOLD ? 'ok' : 'quality_failure'; +} + function usesFileReferencePrompt(provider: Provider): boolean { return isAgentProvider(provider) || provider.kind === 'cli'; } @@ -1212,9 +1219,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise= 0.8 - ? 
'ok' - : 'quality_failure'; + : classifyQualityStatus(result.score); const finalResult = providerError ? { @@ -1511,7 +1516,7 @@ async function evaluateCandidate(options: { trace: trace, output: output, fileChanges, - executionStatus: score.score >= 0.8 ? 'ok' as const : 'quality_failure' as const, + executionStatus: classifyQualityStatus(score.score), }; } From 015b77da2d286255ba12460f5053eee41809339c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Fri, 6 Mar 2026 01:40:12 +0000 Subject: [PATCH 9/9] style: fix biome formatting in statistics and execution-status test Co-Authored-By: Claude Opus 4.6 --- apps/cli/src/commands/eval/statistics.ts | 4 +--- packages/core/test/evaluation/execution-status.test.ts | 9 ++------- 2 files changed, 3 insertions(+), 10 deletions(-) diff --git a/apps/cli/src/commands/eval/statistics.ts b/apps/cli/src/commands/eval/statistics.ts index 4ace301de..42106c0d7 100644 --- a/apps/cli/src/commands/eval/statistics.ts +++ b/apps/cli/src/commands/eval/statistics.ts @@ -134,9 +134,7 @@ export function calculateEvaluationSummary( // Count by execution status const executionErrorCount = executionErrors.length; - const qualityFailureCount = results.filter( - (r) => r.executionStatus === 'quality_failure', - ).length; + const qualityFailureCount = results.filter((r) => r.executionStatus === 'quality_failure').length; const passedCount = results.filter((r) => r.executionStatus === 'ok').length; // Aggregate by failure stage and reason (execution errors only) diff --git a/packages/core/test/evaluation/execution-status.test.ts b/packages/core/test/evaluation/execution-status.test.ts index 21748c162..21d222542 100644 --- a/packages/core/test/evaluation/execution-status.test.ts +++ b/packages/core/test/evaluation/execution-status.test.ts @@ -2,10 +2,7 @@ import { describe, expect, it } from 'bun:test'; import { runEvalCase } from '../../src/evaluation/orchestrator.js'; import type { ResolvedTarget } from 
'../../src/evaluation/providers/targets.js'; -import type { - Provider, - ProviderResponse, -} from '../../src/evaluation/providers/types.js'; +import type { Provider, ProviderResponse } from '../../src/evaluation/providers/types.js'; import type { EvalTest } from '../../src/evaluation/types.js'; // --------------------------------------------------------------------------- @@ -119,9 +116,7 @@ describe('execution status classification', () => { }); it('classifies high-scoring results as ok', async () => { - const provider = new FixedResponseProvider( - 'Add structured logging and avoid global state.', - ); + const provider = new FixedResponseProvider('Add structured logging and avoid global state.'); const result = await runEvalCase({ evalCase: baseTestCase,