Merged
7 changes: 5 additions & 2 deletions CLAUDE.md
@@ -35,10 +35,13 @@ If a feature serves a niche use case or adds conditional logic, it belongs in a
### 3. Align with Industry Standards
Before adding features, research how peer frameworks solve the problem. Prefer the **lowest common denominator** that covers most use cases. Novel features without industry precedent require strong justification and should default to plugin implementation.

### 4. Non-Breaking Extensions
### 4. YAGNI — You Aren't Gonna Need It
Don't build features until there's a concrete need. Before adding a new capability, ask: "Is there real demand for this today, or am I anticipating future needs?" Numeric thresholds, extra tracking fields, and configurable knobs should be omitted until users actually request them. Start with the simplest version (e.g., boolean over numeric range) and extend later if needed.

### 5. Non-Breaking Extensions
New fields should be optional. Existing configurations must continue working unchanged.

### 5. AI-First Design
### 6. AI-First Design
AI agents are the primary users of AgentV—not humans reading docs. Design for AI comprehension and composability.

**Skills over rigid commands:**
6 changes: 6 additions & 0 deletions apps/cli/src/commands/eval/commands/run.ts
@@ -142,6 +142,11 @@ export const evalRunCommand = command({
description:
'Group messages into turn spans for multi-turn evaluations (requires --export-otel)',
}),
retryErrors: option({
type: optional(string),
long: 'retry-errors',
description: 'Path to previous output JSONL — re-run only execution_error test cases',
}),
},
handler: async (args) => {
// Launch interactive wizard when no eval paths and stdin is a TTY
@@ -178,6 +183,7 @@ export const evalRunCommand = command({
otelBackend: args.otelBackend,
otelCaptureContent: args.otelCaptureContent,
otelGroupTurns: args.otelGroupTurns,
retryErrors: args.retryErrors,
};
await runEvalCommand({ testFiles: resolvedPaths, rawOptions });
},
58 changes: 58 additions & 0 deletions apps/cli/src/commands/eval/retry-errors.ts
@@ -0,0 +1,58 @@
import { createReadStream } from 'node:fs';
import { createInterface } from 'node:readline';

import type { EvaluationResult } from '@agentv/core';

/**
* Load test IDs from a JSONL results file that have executionStatus === 'execution_error'.
*/
export async function loadErrorTestIds(jsonlPath: string): Promise<readonly string[]> {
const ids: string[] = [];
const rl = createInterface({
input: createReadStream(jsonlPath),
crlfDelay: Number.POSITIVE_INFINITY,
});

for await (const line of rl) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const parsed = JSON.parse(trimmed) as Partial<EvaluationResult>;
if (parsed.executionStatus === 'execution_error' && parsed.testId) {
ids.push(parsed.testId);
}
} catch {
// Skip malformed lines
}
}

return [...new Set(ids)];
}

/**
* Load results from a JSONL file that do NOT have executionStatus === 'execution_error'.
* These are the "good" results that should be preserved when merging retry output.
*/
export async function loadNonErrorResults(jsonlPath: string): Promise<readonly EvaluationResult[]> {
const results: EvaluationResult[] = [];
const rl = createInterface({
input: createReadStream(jsonlPath),
crlfDelay: Number.POSITIVE_INFINITY,
});

for await (const line of rl) {
const trimmed = line.trim();
if (!trimmed) continue;
try {
const parsed = JSON.parse(trimmed) as Partial<EvaluationResult>;
if (!parsed.testId || parsed.score === undefined) continue;
if (parsed.executionStatus !== 'execution_error') {
results.push(parsed as EvaluationResult);
}
} catch {
// Skip malformed lines
}
}

return results;
}
43 changes: 42 additions & 1 deletion apps/cli/src/commands/eval/run-eval.ts
@@ -8,6 +8,7 @@ import {
type EvaluationCache,
type EvaluationResult,
type ExecutionDefaults,
type FailOnError,
type OtelTraceExporter as OtelTraceExporterType,
ResponseCache,
type TrialsConfig,
@@ -33,6 +34,7 @@ import {
getDefaultExtension,
} from './output-writer.js';
import { ProgressDisplay, type WorkerProgress } from './progress-display.js';
import { loadErrorTestIds, loadNonErrorResults } from './retry-errors.js';
import { findRepoRoot } from './shared.js';
import {
calculateEvaluationSummary,
@@ -74,6 +76,7 @@ interface NormalizedOptions {
readonly otelBackend?: string;
readonly otelCaptureContent: boolean;
readonly otelGroupTurns: boolean;
readonly retryErrors?: string;
}

function normalizeBoolean(value: unknown): boolean {
@@ -225,6 +228,7 @@ function normalizeOptions(
otelBackend: normalizeString(rawOptions.otelBackend),
otelCaptureContent: normalizeBoolean(rawOptions.otelCaptureContent),
otelGroupTurns: normalizeBoolean(rawOptions.otelGroupTurns),
retryErrors: normalizeString(rawOptions.retryErrors),
} satisfies NormalizedOptions;
}

@@ -328,6 +332,7 @@ async function prepareFileMetadata(params: {
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
}> {
const { testFilePath, repoRoot, cwd, options } = params;

@@ -419,6 +424,7 @@
yamlCache: suite.cacheConfig?.enabled,
yamlCachePath: suite.cacheConfig?.cachePath,
totalBudgetUsd: suite.totalBudgetUsd,
failOnError: suite.failOnError,
};
}

@@ -460,6 +466,7 @@ async function runSingleEvalFile(params: {
readonly trialsConfig?: TrialsConfig;
readonly matrixMode?: boolean;
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
}): Promise<{ results: EvaluationResult[] }> {
const {
testFilePath,
@@ -480,6 +487,7 @@
trialsConfig,
matrixMode,
totalBudgetUsd,
failOnError,
} = params;

const targetName = selection.targetName;
@@ -562,6 +570,7 @@
cleanupWorkspaces: options.cleanupWorkspaces,
trials: trialsConfig,
totalBudgetUsd,
failOnError,
streamCallbacks: streamingObserver?.getStreamCallbacks(),
onResult: async (result: EvaluationResult) => {
// Finalize streaming observer span with score
@@ -634,7 +643,26 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
// Pass a dummy file in cwd so the search starts from the working directory.
const yamlConfig = await loadConfig(path.join(cwd, '_'), repoRoot);

const options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);
let options = normalizeOptions(input.rawOptions, config, yamlConfig?.execution);

// --retry-errors: override filter to only re-run execution_error test cases.
// IMPORTANT: JSONL must be fully loaded here, before the output writer is created below,
// since the retry source and output destination may refer to the same file.
let retryNonErrorResults: readonly EvaluationResult[] | undefined;
if (options.retryErrors) {
const retryPath = path.resolve(options.retryErrors);
await ensureFileExists(retryPath, 'Retry-errors JSONL file');
const errorIds = await loadErrorTestIds(retryPath);
if (errorIds.length === 0) {
console.log('No execution errors found in the previous output. Nothing to retry.');
return;
}
console.log(`Retrying ${errorIds.length} execution-error test(s): ${errorIds.join(', ')}`);
// Override the filter to match only error test IDs using micromatch brace expansion
const filterPattern = errorIds.length === 1 ? errorIds[0] : `{${errorIds.join(',')}}`;
options = { ...options, filter: filterPattern };
retryNonErrorResults = await loadNonErrorResults(retryPath);
}

if (options.keepWorkspaces && options.cleanupWorkspaces) {
console.warn(
@@ -767,6 +795,7 @@
readonly yamlCache?: boolean;
readonly yamlCachePath?: string;
readonly totalBudgetUsd?: number;
readonly failOnError?: FailOnError;
}
>();
for (const testFilePath of resolvedTestFiles) {
@@ -915,6 +944,7 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise<void>
trialsConfig: targetPrep.trialsConfig,
matrixMode: targetPrep.selections.length > 1,
totalBudgetUsd: targetPrep.totalBudgetUsd,
failOnError: targetPrep.failOnError,
});

allResults.push(...result.results);
@@ -923,6 +953,17 @@

progressReporter.finish();

// Merge non-error results from previous run when using --retry-errors
if (retryNonErrorResults && retryNonErrorResults.length > 0) {
for (const preserved of retryNonErrorResults) {
await outputWriter.append(preserved);
}
allResults.push(...retryNonErrorResults);
console.log(
`Merged ${retryNonErrorResults.length} non-error result(s) from previous output.`,
);
}

const summary = calculateEvaluationSummary(allResults);
console.log(formatEvaluationSummary(summary));

89 changes: 89 additions & 0 deletions apps/cli/test/unit/retry-errors.test.ts
@@ -0,0 +1,89 @@
import { afterEach, describe, expect, it } from 'bun:test';
import { mkdtempSync, rmSync, writeFileSync } from 'node:fs';
import { tmpdir } from 'node:os';
import path from 'node:path';

import { loadErrorTestIds, loadNonErrorResults } from '../../src/commands/eval/retry-errors.js';

describe('retry-errors', () => {
let tmpDir: string;

afterEach(() => {
if (tmpDir) {
rmSync(tmpDir, { recursive: true, force: true });
}
});

function createJsonlFile(lines: object[]): string {
tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
const filePath = path.join(tmpDir, 'results.jsonl');
writeFileSync(filePath, lines.map((l) => JSON.stringify(l)).join('\n'));
return filePath;
}

it('loadErrorTestIds returns only execution_error test IDs', async () => {
const filePath = createJsonlFile([
{ testId: 'case-1', executionStatus: 'ok', score: 0.9 },
{ testId: 'case-2', executionStatus: 'execution_error', score: 0, error: 'timeout' },
{ testId: 'case-3', executionStatus: 'quality_failure', score: 0.3 },
{ testId: 'case-4', executionStatus: 'execution_error', score: 0, error: 'provider failed' },
]);

const ids = await loadErrorTestIds(filePath);
expect(ids).toEqual(['case-2', 'case-4']);
});

it('loadErrorTestIds deduplicates IDs', async () => {
const filePath = createJsonlFile([
{ testId: 'case-1', executionStatus: 'execution_error', score: 0 },
{ testId: 'case-1', executionStatus: 'execution_error', score: 0 },
]);

const ids = await loadErrorTestIds(filePath);
expect(ids).toEqual(['case-1']);
});

it('loadErrorTestIds returns empty array when no errors', async () => {
const filePath = createJsonlFile([
{ testId: 'case-1', executionStatus: 'ok', score: 0.9 },
{ testId: 'case-2', executionStatus: 'quality_failure', score: 0.5 },
]);

const ids = await loadErrorTestIds(filePath);
expect(ids).toEqual([]);
});

it('loadNonErrorResults returns only non-error results', async () => {
const filePath = createJsonlFile([
{ testId: 'case-1', executionStatus: 'ok', score: 0.9 },
{ testId: 'case-2', executionStatus: 'execution_error', score: 0 },
{ testId: 'case-3', executionStatus: 'quality_failure', score: 0.5 },
]);

const results = await loadNonErrorResults(filePath);
expect(results).toHaveLength(2);
expect(results[0].testId).toBe('case-1');
expect(results[1].testId).toBe('case-3');
});

it('skips malformed JSON lines', async () => {
tmpDir = mkdtempSync(path.join(tmpdir(), 'retry-errors-test-'));
const filePath = path.join(tmpDir, 'results.jsonl');
writeFileSync(
filePath,
[
JSON.stringify({ testId: 'case-1', executionStatus: 'execution_error', score: 0 }),
'not valid json',
'',
JSON.stringify({ testId: 'case-2', executionStatus: 'ok', score: 0.9 }),
].join('\n'),
);

const ids = await loadErrorTestIds(filePath);
expect(ids).toEqual(['case-1']);

const results = await loadNonErrorResults(filePath);
expect(results).toHaveLength(1);
expect(results[0].testId).toBe('case-2');
});
});
2 changes: 1 addition & 1 deletion apps/web/src/content/docs/evaluation/eval-files.mdx
@@ -34,7 +34,7 @@ tests:
|-------|-------------|
| `description` | Human-readable description of the evaluation |
| `dataset` | Optional dataset identifier |
| `execution` | Default execution config (for example `target`) |
| `execution` | Default execution config (`target`, `fail_on_error`, etc.) |
| `workspace` | Suite-level workspace config (lifecycle hooks, template) |
| `tests` | Array of individual tests, or a string path to an external file |
| `assert` | Suite-level evaluators appended to each test unless `execution.skip_defaults: true` is set on the test |
27 changes: 27 additions & 0 deletions apps/web/src/content/docs/evaluation/running-evals.mdx
@@ -87,6 +87,33 @@ agentv eval evals/my-eval.yaml --cleanup-workspaces

Workspaces are stored at `~/.agentv/workspaces/<eval-run-id>/<test-id>/`.

### Retry Execution Errors

Re-run only the test cases that failed with execution (infrastructure) errors in a previous run:

```bash
agentv eval evals/my-eval.yaml --retry-errors .agentv/results/eval_previous.jsonl
```

This reads the previous JSONL, filters for `executionStatus === 'execution_error'`, and re-runs only those test cases. Non-error results from the previous run are preserved and merged into the new output.
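The selection step can be sketched as pure logic over the JSONL lines. This is a minimal sketch, not the CLI's actual implementation; it assumes only the result-line fields (`testId`, `executionStatus`, `score`) shown in the diff:

```typescript
// Minimal sketch of --retry-errors selection: split a previous JSONL run
// into IDs to re-run and non-error results to preserve. Assumes the line
// shape { testId, executionStatus, score } used by the CLI's output.
interface ResultLine {
  testId?: string;
  executionStatus?: string;
  score?: number;
}

function splitRetrySet(jsonl: string): { retryIds: string[]; preserved: ResultLine[] } {
  const retryIds = new Set<string>();
  const preserved: ResultLine[] = [];
  for (const raw of jsonl.split("\n")) {
    const line = raw.trim();
    if (!line) continue; // skip blank lines
    let parsed: ResultLine;
    try {
      parsed = JSON.parse(line) as ResultLine;
    } catch {
      continue; // malformed lines are skipped, mirroring the CLI's tolerance
    }
    if (!parsed.testId) continue;
    if (parsed.executionStatus === "execution_error") {
      retryIds.add(parsed.testId); // de-duplicated via the Set
    } else if (parsed.score !== undefined) {
      preserved.push(parsed); // "good" result, merged back after the retry
    }
  }
  return { retryIds: [...retryIds], preserved };
}

const sample = [
  '{"testId":"case-1","executionStatus":"ok","score":0.9}',
  '{"testId":"case-2","executionStatus":"execution_error","score":0}',
  "not valid json",
].join("\n");

const { retryIds, preserved } = splitRetrySet(sample);
console.log(retryIds.join(","), preserved.length); // → case-2 1
```

The error IDs then become the run filter (a single ID as-is, several joined into a micromatch brace pattern such as `{case-2,case-4}`), and the preserved results are appended to the new output after the retried tests finish.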

### Execution Error Tolerance

Control whether the eval run halts on execution errors using `execution.fail_on_error` in the eval YAML:

```yaml
execution:
fail_on_error: false # never halt on errors (default)
# fail_on_error: true # halt on first execution error
```

| Value | Behavior |
|-------|----------|
| `true` | Halt immediately on first execution error |
| `false` | Continue despite errors (default) |

When halted, remaining tests are recorded with `failureReasonCode: 'error_threshold_exceeded'`. With concurrency > 1, a few additional tests may complete before halting takes effect.
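A rough model of that behavior, assuming a worker-pool runner (the names `runWithFailOnError` and `Outcome` here are illustrative, not AgentV's API):

```typescript
// Hypothetical model of fail_on_error under concurrency: once a test hits
// an execution error, no new tests are scheduled, in-flight tests still
// finish, and never-scheduled tests keep error_threshold_exceeded.
type Outcome = "ok" | "execution_error" | "error_threshold_exceeded";

async function runWithFailOnError(
  tests: Array<() => Promise<"ok" | "execution_error">>,
  failOnError: boolean,
): Promise<Outcome[]> {
  // Tests never scheduled before the halt keep this sentinel value.
  const outcomes: Outcome[] = new Array(tests.length).fill("error_threshold_exceeded");
  let halted = false;
  let next = 0;

  // Each worker claims the next test index until the queue empties or a halt is flagged.
  const worker = async (): Promise<void> => {
    while (!halted && next < tests.length) {
      const i = next++;
      const status = await tests[i]();
      outcomes[i] = status; // a test claimed before the halt still records its result
      if (failOnError && status === "execution_error") halted = true;
    }
  };

  // Concurrency of 2: a second in-flight test may complete after the halt.
  await Promise.all([worker(), worker()]);
  return outcomes;
}

const demo = [
  async () => "ok" as const,
  async () => "execution_error" as const,
  async () => "ok" as const,
  async () => "ok" as const,
];

runWithFailOnError(demo, true).then((outcomes) => {
  console.log(outcomes.join(","));
});
```

Here the error in the second test flags the halt: the last test is never scheduled and keeps `error_threshold_exceeded`, while a test already claimed by the other worker may still finish.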

## Validate Before Running

Check eval files for schema errors without executing: