From 391e4fd8d4af13a8c6dceb0cb7844f27f077ea4d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 14 Mar 2026 04:22:39 +0000 Subject: [PATCH] feat: skill-eval companion artifacts (grading, timing, benchmark) (#565) Add ArtifactWriter module that produces grading/.json, timing.json, and benchmark.json from existing JSONL eval results. Includes --artifacts CLI flag for eval run command. - Grading artifacts map per-evaluator hits/misses to skill-creator's expectations/evidence format with AgentV extensions (evaluators, workspace_changes, conversation) - Timing artifact aggregates duration and token usage across all results - Benchmark artifact computes per-target statistics (mean/stddev) for pass_rate, time, tokens, tool_calls, and cost - JSONL parser handles snake_case keys from existing output files - 29 tests covering artifact generation, schema compatibility, and I/O - Schemas are supersets of Anthropic skill-creator conventions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/eval/artifact-writer.ts | 492 +++++++++++++++++ apps/cli/src/commands/eval/commands/run.ts | 7 + apps/cli/src/commands/eval/run-eval.ts | 18 + .../commands/eval/artifact-writer.test.ts | 507 ++++++++++++++++++ 4 files changed, 1024 insertions(+) create mode 100644 apps/cli/src/commands/eval/artifact-writer.ts create mode 100644 apps/cli/test/commands/eval/artifact-writer.test.ts diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts new file mode 100644 index 000000000..275484c01 --- /dev/null +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -0,0 +1,492 @@ +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; + +import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; + +// --------------------------------------------------------------------------- +// Artifact interfaces (snake_case to match skill-creator conventions) +// 
---------------------------------------------------------------------------

/**
 * Per-test grading artifact, written to `grading/<test_id>.json`.
 *
 * `expectations`, `summary`, and `execution_metrics` follow the skill-creator
 * grading schema; `evaluators`, `workspace_changes`, and `conversation` are
 * optional AgentV extensions.
 */
export interface GradingArtifact {
  /** One entry per evaluator hit (passed) or miss (failed). */
  readonly expectations: readonly {
    readonly text: string;
    readonly passed: boolean;
    readonly evidence: string;
  }[];
  /** Aggregate pass/fail counts over `expectations`. */
  readonly summary: {
    readonly passed: number;
    readonly failed: number;
    readonly total: number;
    /** passed / total, rounded to 3 decimals; 0 when there are no expectations. */
    readonly pass_rate: number;
  };
  readonly execution_metrics: {
    /** Per-tool invocation counts derived from trace steps. */
    readonly tool_calls: Record<string, number>;
    readonly total_tool_calls: number;
    /** 1 when the result carried an error, otherwise 0. */
    readonly errors_encountered: number;
  };
  /** AgentV extension: raw per-evaluator results (extra keys allowed). */
  readonly evaluators?: readonly {
    readonly name: string;
    readonly type: string;
    readonly score: number;
    readonly reasoning: string;
    readonly [key: string]: unknown;
  }[];
  /** AgentV extension: counts parsed from the workspace unified diff. */
  readonly workspace_changes?: {
    readonly files_modified: number;
    readonly files_created: number;
    readonly diff_summary: string;
  };
  /** AgentV extension: present only when the result has a conversation id. */
  readonly conversation?: {
    readonly turns: number;
    readonly conversation_id: string;
  };
}

/** Aggregate duration and token totals across all results (`timing.json`). */
export interface TimingArtifact {
  readonly total_tokens: number;
  readonly duration_ms: number;
  /** `duration_ms` converted to seconds, rounded to 3 decimals. */
  readonly total_duration_seconds: number;
  readonly token_usage: {
    readonly input: number;
    readonly output: number;
  };
}

/** Per-target benchmark statistics (`benchmark.json`). */
export interface BenchmarkArtifact {
  readonly metadata: {
    readonly eval_file: string;
    readonly timestamp: string;
    readonly targets: readonly string[];
    readonly tests_run: readonly string[];
  };
  /** Keyed by target name; mean/stddev over that target's results. */
  readonly run_summary: Record<
    string,
    {
      readonly pass_rate: { readonly mean: number; readonly stddev: number };
      readonly time_seconds: { readonly mean: number; readonly stddev: number };
      readonly tokens: { readonly mean: number; readonly stddev: number };
      readonly tool_calls?: { readonly mean: number; readonly stddev: number };
      readonly cost_usd?: { readonly mean: number; readonly stddev: number };
    }
  >;
  /** Keyed by `name:type` of each evaluator, across all targets. */
  readonly per_evaluator_summary?: Record<
    string,
    { readonly mean: number; readonly stddev: number }
  >;
  readonly notes: readonly string[];
}

// 
--------------------------------------------------------------------------- +// Statistics helpers +// --------------------------------------------------------------------------- + +const PASS_THRESHOLD = 0.8; + +function computeStats(values: readonly number[]): { mean: number; stddev: number } { + if (values.length === 0) { + return { mean: 0, stddev: 0 }; + } + const mean = values.reduce((sum, v) => sum + v, 0) / values.length; + const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length; + return { + mean: Math.round(mean * 1000) / 1000, + stddev: Math.round(Math.sqrt(variance) * 1000) / 1000, + }; +} + +function computePassRate(result: EvaluationResult): number { + const scores = result.scores; + if (scores && scores.length > 0) { + const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + return passed / scores.length; + } + return result.score >= PASS_THRESHOLD ? 1.0 : 0.0; +} + +// --------------------------------------------------------------------------- +// Tool-call counting from trace data +// --------------------------------------------------------------------------- + +function countToolCalls(result: EvaluationResult): { + toolCalls: Record; + total: number; +} { + const toolCalls: Record = {}; + let total = 0; + + const trace = result.trace as + | { steps?: readonly { toolName?: string; type?: string }[] } + | undefined; + + if (trace?.steps) { + for (const step of trace.steps) { + if (step.toolName || step.type === 'tool') { + const name = step.toolName ?? 'unknown'; + toolCalls[name] = (toolCalls[name] ?? 
0) + 1; + total += 1; + } + } + } + + return { toolCalls, total }; +} + +// --------------------------------------------------------------------------- +// Workspace change parsing from fileChanges diff +// --------------------------------------------------------------------------- + +function parseWorkspaceChanges( + fileChanges: string | undefined, +): GradingArtifact['workspace_changes'] | undefined { + if (!fileChanges) { + return undefined; + } + + let filesModified = 0; + let filesCreated = 0; + + const lines = fileChanges.split('\n'); + for (const line of lines) { + if (line.startsWith('--- /dev/null')) { + filesCreated += 1; + } else if (line.startsWith('--- a/')) { + filesModified += 1; + } + } + + const summaryLines = lines.slice(0, 20); + const diffSummary = + lines.length > 20 + ? `${summaryLines.join('\n')}\n... (${lines.length - 20} more lines)` + : fileChanges; + + return { + files_modified: filesModified, + files_created: filesCreated, + diff_summary: diffSummary, + }; +} + +// --------------------------------------------------------------------------- +// Build expectations from evaluator results (skill-creator compatible) +// --------------------------------------------------------------------------- + +function buildExpectations(result: EvaluationResult): GradingArtifact['expectations'] { + const expectations: { + text: string; + passed: boolean; + evidence: string; + }[] = []; + + if (result.scores && result.scores.length > 0) { + for (const evaluator of result.scores) { + for (const hit of evaluator.hits) { + expectations.push({ + text: hit, + passed: true, + evidence: evaluator.reasoning ?? '', + }); + } + for (const miss of evaluator.misses) { + expectations.push({ + text: miss, + passed: false, + evidence: evaluator.reasoning ?? '', + }); + } + } + } else { + for (const hit of result.hits) { + expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? 
'' }); + } + for (const miss of result.misses) { + expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? '' }); + } + } + + return expectations; +} + +// --------------------------------------------------------------------------- +// Build evaluators list +// --------------------------------------------------------------------------- + +function buildEvaluators( + scores: readonly EvaluatorResult[] | undefined, +): GradingArtifact['evaluators'] { + if (!scores || scores.length === 0) { + return undefined; + } + + return scores.map((s) => ({ + name: s.name, + type: s.type, + score: s.score, + reasoning: s.reasoning ?? '', + weight: s.weight, + verdict: s.verdict, + hits: s.hits, + misses: s.misses, + details: s.details, + })); +} + +// --------------------------------------------------------------------------- +// Public artifact builders +// --------------------------------------------------------------------------- + +export function buildGradingArtifact(result: EvaluationResult): GradingArtifact { + const expectations = buildExpectations(result); + const passed = expectations.filter((e) => e.passed).length; + const failed = expectations.filter((e) => !e.passed).length; + const total = expectations.length; + + const { toolCalls, total: totalToolCalls } = countToolCalls(result); + const errorsEncountered = result.error ? 1 : 0; + + return { + expectations, + summary: { + passed, + failed, + total, + pass_rate: total > 0 ? Math.round((passed / total) * 1000) / 1000 : 0, + }, + execution_metrics: { + tool_calls: toolCalls, + total_tool_calls: totalToolCalls, + errors_encountered: errorsEncountered, + }, + evaluators: buildEvaluators(result.scores), + workspace_changes: parseWorkspaceChanges(result.fileChanges), + conversation: result.conversationId + ? { + turns: result.trace + ? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 
0) + : 0, + conversation_id: result.conversationId, + } + : undefined, + }; +} + +export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact { + let totalInput = 0; + let totalOutput = 0; + let totalDurationMs = 0; + + for (const result of results) { + const usage = result.tokenUsage as { input?: number; output?: number } | undefined; + if (usage) { + totalInput += usage.input ?? 0; + totalOutput += usage.output ?? 0; + } + if (result.durationMs != null) { + totalDurationMs += result.durationMs; + } + } + + return { + total_tokens: totalInput + totalOutput, + duration_ms: totalDurationMs, + total_duration_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000, + token_usage: { + input: totalInput, + output: totalOutput, + }, + }; +} + +export function buildBenchmarkArtifact( + results: readonly EvaluationResult[], + evalFile = '', +): BenchmarkArtifact { + const targetSet = new Set(); + const testIdSet = new Set(); + for (const result of results) { + targetSet.add(result.target); + testIdSet.add(result.testId); + } + + const targets = [...targetSet].sort(); + const testIds = [...testIdSet].sort(); + + const runSummary: BenchmarkArtifact['run_summary'] = {}; + const notes: string[] = []; + + for (const target of targets) { + const targetResults = results.filter((r) => r.target === target); + + const passRates = targetResults.map(computePassRate); + const timings = targetResults + .filter((r) => r.durationMs != null) + .map((r) => (r.durationMs as number) / 1000); + const tokens = targetResults + .filter((r) => r.tokenUsage != null) + .map((r) => { + const usage = r.tokenUsage as { input?: number; output?: number }; + return (usage.input ?? 0) + (usage.output ?? 
0); + }); + + const entry: Record = { + pass_rate: computeStats(passRates), + time_seconds: computeStats(timings), + tokens: computeStats(tokens), + }; + + // Optional tool_calls stats from trace data + const toolCallCounts = targetResults.map((r) => countToolCalls(r).total); + if (toolCallCounts.some((c) => c > 0)) { + entry.tool_calls = computeStats(toolCallCounts); + } + + // Optional cost stats + const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd as number); + if (costs.length > 0) { + entry.cost_usd = computeStats(costs); + } + + runSummary[target] = entry as (typeof runSummary)[string]; + } + + // Per-evaluator summary across all results + const evaluatorScores = new Map(); + for (const result of results) { + if (result.scores) { + for (const score of result.scores) { + const key = `${score.name}:${score.type}`; + if (!evaluatorScores.has(key)) { + evaluatorScores.set(key, []); + } + evaluatorScores.get(key)?.push(score.score); + } + } + } + + let perEvaluatorSummary: Record | undefined; + if (evaluatorScores.size > 0) { + perEvaluatorSummary = {}; + for (const [key, scores] of evaluatorScores) { + perEvaluatorSummary[key] = computeStats(scores); + } + } + + const errorCount = results.filter((r) => r.executionStatus === 'execution_error').length; + if (errorCount > 0) { + notes.push( + `${errorCount} test(s) had execution errors and are included in pass_rate as failures`, + ); + } + if (results.length === 0) { + notes.push('No results to summarize'); + } + + const firstResult = results[0]; + const timestamp = firstResult?.timestamp ?? 
new Date().toISOString(); + + return { + metadata: { + eval_file: evalFile, + timestamp, + targets, + tests_run: testIds, + }, + run_summary: runSummary, + per_evaluator_summary: perEvaluatorSummary, + notes, + }; +} + +// --------------------------------------------------------------------------- +// Snake_case to camelCase conversion for reading JSONL files +// --------------------------------------------------------------------------- + +function toCamelCase(str: string): string { + return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase()); +} + +function toCamelCaseDeep(obj: unknown): unknown { + if (obj === null || obj === undefined) { + return obj; + } + if (Array.isArray(obj)) { + return obj.map((item) => toCamelCaseDeep(item)); + } + if (typeof obj === 'object') { + const result: Record = {}; + for (const [key, value] of Object.entries(obj)) { + result[toCamelCase(key)] = toCamelCaseDeep(value); + } + return result; + } + return obj; +} + +// --------------------------------------------------------------------------- +// JSONL parsing +// --------------------------------------------------------------------------- + +export function parseJsonlResults(content: string): EvaluationResult[] { + const results: EvaluationResult[] = []; + const lines = content.split('\n'); + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.length === 0) { + continue; + } + try { + const parsed = JSON.parse(trimmed); + // JSONL files from AgentV use snake_case; convert back to camelCase + const camelCased = toCamelCaseDeep(parsed); + results.push(camelCased as EvaluationResult); + } catch { + // Skip malformed lines + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Artifact writer — reads JSONL and writes all three artifact types +// --------------------------------------------------------------------------- + +export async function writeArtifacts( + jsonlPath: string, + outputDir: 
string, + options?: { evalFile?: string }, +): Promise<{ gradingDir: string; timingPath: string; benchmarkPath: string }> { + const content = await readFile(jsonlPath, 'utf8'); + const results = parseJsonlResults(content); + + return writeArtifactsFromResults(results, outputDir, options); +} + +export async function writeArtifactsFromResults( + results: readonly EvaluationResult[], + outputDir: string, + options?: { evalFile?: string }, +): Promise<{ gradingDir: string; timingPath: string; benchmarkPath: string }> { + const gradingDir = path.join(outputDir, 'grading'); + const timingPath = path.join(outputDir, 'timing.json'); + const benchmarkPath = path.join(outputDir, 'benchmark.json'); + + await mkdir(gradingDir, { recursive: true }); + + // Write per-test grading artifacts + for (const result of results) { + const grading = buildGradingArtifact(result); + const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, '_'); + const gradingPath = path.join(gradingDir, `${safeTestId}.json`); + await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}\n`, 'utf8'); + } + + // Write aggregate timing + const timing = buildTimingArtifact(results); + await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); + + // Write benchmark + const benchmark = buildBenchmarkArtifact(results, options?.evalFile); + await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); + + return { gradingDir, timingPath, benchmarkPath }; +} diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 3b1357b94..da8d39fc9 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -157,6 +157,12 @@ export const evalRunCommand = command({ long: 'benchmark-json', description: 'Write Agent Skills benchmark.json to the specified path', }), + artifacts: option({ + type: optional(string), + long: 'artifacts', + description: + 'Write companion artifacts 
(grading/.json, timing.json, benchmark.json) to the specified directory', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -196,6 +202,7 @@ export const evalRunCommand = command({ retryErrors: args.retryErrors, strict: args.strict, benchmarkJson: args.benchmarkJson, + artifacts: args.artifacts, }; await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); }, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index c73779f94..43eec380b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -26,6 +26,7 @@ import { } from '@agentv/core'; import { enforceRequiredVersion } from '../../version-check.js'; +import { writeArtifactsFromResults } from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { @@ -80,6 +81,7 @@ interface NormalizedOptions { readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly benchmarkJson?: string; + readonly artifacts?: string; } function normalizeBoolean(value: unknown): boolean { @@ -246,6 +248,7 @@ function normalizeOptions( workspaceMode, workspacePath, benchmarkJson: normalizeString(rawOptions.benchmarkJson), + artifacts: normalizeString(rawOptions.artifacts), } satisfies NormalizedOptions; } @@ -1046,6 +1049,21 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise console.log(`Benchmark written to: ${benchmarkPath}`); } + // Write companion artifacts (grading, timing, benchmark) if requested + if (options.artifacts && allResults.length > 0) { + const artifactsDir = path.resolve(options.artifacts); + const evalFile = resolvedTestFiles.length === 1 ? 
resolvedTestFiles[0] : ''; + const { + gradingDir, + timingPath, + benchmarkPath: abp, + } = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile }); + console.log(`Artifacts written to: ${artifactsDir}`); + console.log(` Grading: ${gradingDir} (${allResults.length} files)`); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${abp}`); + } + // Print workspace paths for failed cases (when preserved for debugging) const failedWithWorkspaces = allResults.filter( (r) => r.workspacePath && (r.error || r.score < 0.5), diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts new file mode 100644 index 000000000..0d46e3d63 --- /dev/null +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -0,0 +1,507 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { readFile, readdir, rm } from 'node:fs/promises'; +import path from 'node:path'; + +import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; + +import { + type BenchmarkArtifact, + type GradingArtifact, + type TimingArtifact, + buildBenchmarkArtifact, + buildGradingArtifact, + buildTimingArtifact, + parseJsonlResults, + writeArtifacts, + writeArtifactsFromResults, +} from '../../../src/commands/eval/artifact-writer.js'; + +function makeResult(overrides: Partial = {}): EvaluationResult { + return { + timestamp: '2026-03-13T00:00:00.000Z', + testId: 'test-1', + score: 0.9, + hits: ['criterion-1'], + misses: [], + answer: 'test answer', + target: 'test-target', + executionStatus: 'ok', + ...overrides, + } as EvaluationResult; +} + +function makeEvaluatorResult(overrides: Partial = {}): EvaluatorResult { + return { + name: 'judge-1', + type: 'llm-judge', + score: 0.85, + hits: ['criterion-a'], + misses: ['criterion-b'], + reasoning: 'Good output overall', + ...overrides, + } as EvaluatorResult; +} + +// --------------------------------------------------------------------------- +// 
Grading artifact
// ---------------------------------------------------------------------------

describe('buildGradingArtifact', () => {
  it('maps evaluator hits/misses to expectations', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({
          hits: ['correct format', 'has code'],
          misses: ['missing tests'],
          reasoning: 'Output was formatted well',
        }),
      ],
    });

    const grading = buildGradingArtifact(result);

    // Hits come first (passed), then misses (failed); all share the
    // evaluator's reasoning as evidence.
    expect(grading.expectations).toHaveLength(3);
    expect(grading.expectations[0]).toEqual({
      text: 'correct format',
      passed: true,
      evidence: 'Output was formatted well',
    });
    expect(grading.expectations[1]).toEqual({
      text: 'has code',
      passed: true,
      evidence: 'Output was formatted well',
    });
    expect(grading.expectations[2]).toEqual({
      text: 'missing tests',
      passed: false,
      evidence: 'Output was formatted well',
    });
  });

  it('computes correct summary', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({
          hits: ['a', 'b'],
          misses: ['c'],
        }),
      ],
    });

    const grading = buildGradingArtifact(result);

    // 2/3 rounded to 3 decimals
    expect(grading.summary).toEqual({
      passed: 2,
      failed: 1,
      total: 3,
      pass_rate: 0.667,
    });
  });

  it('falls back to top-level hits/misses when no evaluator scores', () => {
    const result = makeResult({
      hits: ['ok-1', 'ok-2'],
      misses: ['miss-1'],
      reasoning: 'top-level reasoning',
    });

    const grading = buildGradingArtifact(result);

    expect(grading.expectations).toHaveLength(3);
    expect(grading.expectations[0].text).toBe('ok-1');
    expect(grading.expectations[0].evidence).toBe('top-level reasoning');
    expect(grading.expectations[2].text).toBe('miss-1');
    expect(grading.expectations[2].passed).toBe(false);
  });

  it('includes evaluators list with AgentV extensions', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({ name: 'format-check', type: 'code-judge', score: 1.0 }),
        makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.7 }),
      ],
    });

    const grading = buildGradingArtifact(result);

    expect(grading.evaluators).toHaveLength(2);
    expect(grading.evaluators?.[0].name).toBe('format-check');
    expect(grading.evaluators?.[0].type).toBe('code-judge');
    expect(grading.evaluators?.[1].score).toBe(0.7);
  });

  it('records error as errors_encountered', () => {
    const result = makeResult({ error: 'Timeout exceeded' });
    const grading = buildGradingArtifact(result);
    expect(grading.execution_metrics.errors_encountered).toBe(1);
  });

  it('handles result with no hits, misses, or scores', () => {
    const result = makeResult({ hits: [], misses: [], scores: undefined });
    const grading = buildGradingArtifact(result);

    expect(grading.expectations).toHaveLength(0);
    expect(grading.summary).toEqual({
      passed: 0,
      failed: 0,
      total: 0,
      pass_rate: 0,
    });
    expect(grading.evaluators).toBeUndefined();
  });

  it('includes workspace_changes when fileChanges present', () => {
    const diff = [
      '--- /dev/null',
      '+++ b/new-file.ts',
      '@@ -0,0 +1 @@',
      '+console.log("hello")',
      '--- a/existing.ts',
      '+++ b/existing.ts',
      '@@ -1 +1 @@',
      '-old',
      '+new',
    ].join('\n');

    const result = makeResult({ fileChanges: diff });
    const grading = buildGradingArtifact(result);

    expect(grading.workspace_changes).toBeDefined();
    expect(grading.workspace_changes?.files_created).toBe(1);
    expect(grading.workspace_changes?.files_modified).toBe(1);
  });

  it('includes conversation when conversationId present', () => {
    const result = makeResult({ conversationId: 'conv-abc-123' });
    const grading = buildGradingArtifact(result);

    expect(grading.conversation).toBeDefined();
    expect(grading.conversation?.conversation_id).toBe('conv-abc-123');
  });
});

// ---------------------------------------------------------------------------
// Timing artifact
// ---------------------------------------------------------------------------

describe('buildTimingArtifact', () => {
  it('aggregates timing across results', () => {
    const results = [
      makeResult({
        durationMs: 30000,
        tokenUsage: { input: 1000, output: 500 },
      } as Partial<EvaluationResult>),
      makeResult({
        durationMs: 60000,
        tokenUsage: { input: 2000, output: 1000 },
      } as Partial<EvaluationResult>),
    ];

    const timing = buildTimingArtifact(results);

    expect(timing.total_tokens).toBe(4500);
    expect(timing.duration_ms).toBe(90000);
    expect(timing.total_duration_seconds).toBe(90);
    expect(timing.token_usage).toEqual({ input: 3000, output: 1500 });
  });

  it('handles results with no timing data', () => {
    const results = [makeResult({})];
    const timing = buildTimingArtifact(results);

    expect(timing.total_tokens).toBe(0);
    expect(timing.duration_ms).toBe(0);
    expect(timing.total_duration_seconds).toBe(0);
    expect(timing.token_usage).toEqual({ input: 0, output: 0 });
  });

  it('handles empty results array', () => {
    const timing = buildTimingArtifact([]);

    expect(timing.total_tokens).toBe(0);
    expect(timing.duration_ms).toBe(0);
    expect(timing.total_duration_seconds).toBe(0);
  });

  it('handles partial token usage', () => {
    const results = [
      makeResult({
        tokenUsage: { input: 500 },
      } as Partial<EvaluationResult>),
    ];

    const timing = buildTimingArtifact(results);
    expect(timing.total_tokens).toBe(500);
    expect(timing.token_usage).toEqual({ input: 500, output: 0 });
  });
});

// ---------------------------------------------------------------------------
// Benchmark artifact
// ---------------------------------------------------------------------------

describe('buildBenchmarkArtifact', () => {
  it('computes per-target statistics', () => {
    const results = [
      makeResult({ target: 'gpt-4', score: 0.9, durationMs: 30000 }),
      makeResult({ target: 'gpt-4', testId: 'test-2', score: 0.8, durationMs: 60000 }),
      makeResult({ target: 'claude', score: 0.5, durationMs: 45000 }),
    ];

    const benchmark = buildBenchmarkArtifact(results, 'test.eval.yaml');

    expect(benchmark.metadata.eval_file).toBe('test.eval.yaml');
    expect(benchmark.metadata.targets).toEqual(['claude', 'gpt-4']);
    expect(benchmark.metadata.tests_run).toEqual(['test-1', 'test-2']);

    // gpt-4: both pass (>= 0.8), pass_rate mean = 1.0
    expect(benchmark.run_summary['gpt-4'].pass_rate.mean).toBe(1);
    // claude: 0.5 < 0.8 → 0.0, pass_rate mean = 0.0
    expect(benchmark.run_summary.claude.pass_rate.mean).toBe(0);

    // gpt-4: (30+60)/2 = 45 seconds; population stddev of {30, 60} = 15
    expect(benchmark.run_summary['gpt-4'].time_seconds.mean).toBe(45);
    expect(benchmark.run_summary['gpt-4'].time_seconds.stddev).toBe(15);
  });

  it('includes per-evaluator summary', () => {
    const results = [
      makeResult({
        scores: [makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.9 })],
      }),
      makeResult({
        testId: 'test-2',
        scores: [makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.7 })],
      }),
    ];

    const benchmark = buildBenchmarkArtifact(results);

    expect(benchmark.per_evaluator_summary).toBeDefined();
    expect(benchmark.per_evaluator_summary?.['quality:llm-judge'].mean).toBe(0.8);
  });

  it('adds note when execution errors present', () => {
    const results = [makeResult({ executionStatus: 'execution_error', score: 0 })];

    const benchmark = buildBenchmarkArtifact(results);
    expect(benchmark.notes.some((n) => n.includes('execution errors'))).toBe(true);
  });

  it('handles empty results', () => {
    const benchmark = buildBenchmarkArtifact([]);

    expect(benchmark.metadata.targets).toEqual([]);
    expect(benchmark.metadata.tests_run).toEqual([]);
    expect(benchmark.notes).toContain('No results to summarize');
  });

  it('includes cost_usd when available', () => {
    const results = [makeResult({ costUsd: 0.05 }), makeResult({ testId: 'test-2', costUsd: 0.1 })];

    const benchmark = buildBenchmarkArtifact(results);
    const summary = benchmark.run_summary['test-target'];
    expect(summary.cost_usd).toBeDefined();
    expect(summary.cost_usd?.mean).toBe(0.075);
  });
});

// ---------------------------------------------------------------------------
// JSONL parsing
// ---------------------------------------------------------------------------

describe('parseJsonlResults', () => {
  it('parses multi-line JSONL', () => {
    const line1 = JSON.stringify({ testId: 'a', score: 0.9 });
    const line2 = JSON.stringify({ testId: 'b', score: 0.5 });
    const content = `${line1}\n${line2}\n`;

    const results = parseJsonlResults(content);
    expect(results).toHaveLength(2);
    expect(results[0].testId).toBe('a');
    expect(results[1].testId).toBe('b');
  });

  it('handles empty content', () => {
    expect(parseJsonlResults('')).toHaveLength(0);
  });

  it('skips blank lines', () => {
    const line = JSON.stringify({ testId: 'a', score: 0.9 });
    const content = `\n${line}\n\n`;
    expect(parseJsonlResults(content)).toHaveLength(1);
  });

  it('skips malformed lines', () => {
    const good = JSON.stringify({ testId: 'a', score: 0.9 });
    const content = `${good}\nnot json\n`;
    expect(parseJsonlResults(content)).toHaveLength(1);
  });
});

// ---------------------------------------------------------------------------
// Schema compatibility (shared fields match skill-creator format)
// ---------------------------------------------------------------------------

describe('schema compatibility', () => {
  it('grading expectations have text/passed/evidence fields', () => {
    const result = makeResult({
      scores: [makeEvaluatorResult({ hits: ['x'], misses: ['y'], reasoning: 'r' })],
    });
    const grading = buildGradingArtifact(result);

    for (const exp of grading.expectations) {
      expect(exp).toHaveProperty('text');
      expect(exp).toHaveProperty('passed');
      expect(exp).toHaveProperty('evidence');
      expect(typeof exp.text).toBe('string');
      expect(typeof exp.passed).toBe('boolean');
      expect(typeof exp.evidence).toBe('string');
    }
  });

  it('grading summary has passed/failed/total/pass_rate', () => {
    const result = makeResult({
      scores: [makeEvaluatorResult({ hits: ['a'], misses: [] })],
    });
    const grading = buildGradingArtifact(result);

    expect(grading.summary).toHaveProperty('passed');
    expect(grading.summary).toHaveProperty('failed');
    expect(grading.summary).toHaveProperty('total');
    expect(grading.summary).toHaveProperty('pass_rate');
    expect(typeof grading.summary.pass_rate).toBe('number');
  });

  it('timing has total_tokens, duration_ms, total_duration_seconds, token_usage', () => {
    const timing = buildTimingArtifact([makeResult({})]);

    expect(timing).toHaveProperty('total_tokens');
    expect(timing).toHaveProperty('duration_ms');
    expect(timing).toHaveProperty('total_duration_seconds');
    expect(timing).toHaveProperty('token_usage');
    expect(timing.token_usage).toHaveProperty('input');
    expect(timing.token_usage).toHaveProperty('output');
  });

  it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => {
    const benchmark = buildBenchmarkArtifact([makeResult({})]);
    const summary = benchmark.run_summary['test-target'];

    expect(summary).toBeDefined();
    expect(summary.pass_rate).toHaveProperty('mean');
    expect(summary.pass_rate).toHaveProperty('stddev');
    expect(summary.time_seconds).toHaveProperty('mean');
    expect(summary.time_seconds).toHaveProperty('stddev');
    expect(summary.tokens).toHaveProperty('mean');
    expect(summary.tokens).toHaveProperty('stddev');
  });
});

// ---------------------------------------------------------------------------
// File I/O: writeArtifacts / writeArtifactsFromResults
// ---------------------------------------------------------------------------

describe('writeArtifactsFromResults', () => {
  const testDir = path.join(import.meta.dir, '.test-artifact-output');

  beforeEach(() => {
    // Clean before each test to ensure isolation
  });

  afterEach(async () => {
    await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
  });

  it('writes grading, timing, and benchmark files', async () => {
    const results = [
      makeResult({ testId: 'alpha', score: 0.9, durationMs: 5000 }),
      makeResult({ testId: 'beta', score: 0.6, durationMs: 8000 }),
    ];

    const paths = await writeArtifactsFromResults(results, testDir, {
      evalFile: 'my-eval.yaml',
    });

    // Check grading files
    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles.sort()).toEqual(['alpha.json', 'beta.json']);

    const alphaGrading: GradingArtifact = JSON.parse(
      await readFile(path.join(paths.gradingDir, 'alpha.json'), 'utf8'),
    );
    expect(alphaGrading.summary).toBeDefined();
    expect(alphaGrading.execution_metrics).toBeDefined();

    // Check timing
    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.duration_ms).toBe(13000);

    // Check benchmark
    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
    expect(benchmark.metadata.eval_file).toBe('my-eval.yaml');
    expect(benchmark.metadata.tests_run.sort()).toEqual(['alpha', 'beta']);
  });

  it('handles empty results array', async () => {
    const paths = await writeArtifactsFromResults([], testDir);

    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles).toHaveLength(0);

    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.total_tokens).toBe(0);

    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
    expect(benchmark.notes).toContain('No results to summarize');
  });

  it('sanitizes test IDs for filenames', async () => {
    const results = [makeResult({ testId: 'path/to:test*1' })];
    await writeArtifactsFromResults(results, testDir);

    const gradingFiles = await readdir(path.join(testDir, 'grading'));
    expect(gradingFiles).toEqual(['path_to_test_1.json']);
  });
});

describe('writeArtifacts (from JSONL file)', () => {
  const testDir = path.join(import.meta.dir, '.test-artifact-jsonl');
  const jsonlPath = path.join(testDir, 'results.jsonl');

  beforeEach(async () => {
    const { mkdir, writeFile } = await import('node:fs/promises');
    await mkdir(testDir, { recursive: true });
    // Snake_case keys mirror what the CLI's JSONL output actually contains.
    const lines = [
      JSON.stringify({
        timestamp: '2026-01-01T00:00:00Z',
        test_id: 'from-file',
        score: 0.85,
        hits: ['pass-1'],
        misses: [],
        answer: 'file answer',
        target: 'default',
        execution_status: 'ok',
        duration_ms: 12000,
        token_usage: { input: 500, output: 200 },
      }),
    ];
    await writeFile(jsonlPath, `${lines.join('\n')}\n`, 'utf8');
  });

  afterEach(async () => {
    await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
  });

  it('reads JSONL and produces artifacts', async () => {
    const outputDir = path.join(testDir, 'out');
    const paths = await writeArtifacts(jsonlPath, outputDir);

    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles).toHaveLength(1);

    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.duration_ms).toBe(12000);
    expect(timing.total_tokens).toBe(700);
  });
});