From 391e4fd8d4af13a8c6dceb0cb7844f27f077ea4d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 14 Mar 2026 04:22:39 +0000 Subject: [PATCH] feat: skill-eval companion artifacts (grading, timing, benchmark) (#565) Add ArtifactWriter module that produces grading/.json, timing.json, and benchmark.json from existing JSONL eval results. Includes --artifacts CLI flag for eval run command. - Grading artifacts map per-evaluator hits/misses to skill-creator's expectations/evidence format with AgentV extensions (evaluators, workspace_changes, conversation) - Timing artifact aggregates duration and token usage across all results - Benchmark artifact computes per-target statistics (mean/stddev) for pass_rate, time, tokens, tool_calls, and cost - JSONL parser handles snake_case keys from existing output files - 29 tests covering artifact generation, schema compatibility, and I/O - Schemas are supersets of Anthropic skill-creator conventions Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/eval/artifact-writer.ts | 492 +++++++++++++++++ apps/cli/src/commands/eval/commands/run.ts | 7 + apps/cli/src/commands/eval/run-eval.ts | 18 + .../commands/eval/artifact-writer.test.ts | 507 ++++++++++++++++++ 4 files changed, 1024 insertions(+) create mode 100644 apps/cli/src/commands/eval/artifact-writer.ts create mode 100644 apps/cli/test/commands/eval/artifact-writer.test.ts diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts new file mode 100644 index 000000000..275484c01 --- /dev/null +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -0,0 +1,492 @@ +import { mkdir, readFile, writeFile } from 'node:fs/promises'; +import path from 'node:path'; + +import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; + +// --------------------------------------------------------------------------- +// Artifact interfaces (snake_case to match skill-creator conventions) +// 
---------------------------------------------------------------------------

/**
 * Per-test grading artifact, written to `grading/<test_id>.json`.
 *
 * `expectations`, `summary`, and `execution_metrics` follow the skill-creator
 * grading schema; `evaluators`, `workspace_changes`, and `conversation` are
 * optional AgentV extensions.
 */
export interface GradingArtifact {
  /** One entry per evaluator hit (passed) or miss (failed). */
  readonly expectations: readonly {
    readonly text: string;
    readonly passed: boolean;
    readonly evidence: string;
  }[];
  /** Aggregate pass/fail counts over `expectations`. */
  readonly summary: {
    readonly passed: number;
    readonly failed: number;
    readonly total: number;
    /** passed / total, rounded to 3 decimals; 0 when there are no expectations. */
    readonly pass_rate: number;
  };
  readonly execution_metrics: {
    /** Per-tool invocation counts derived from trace steps. */
    readonly tool_calls: Record<string, number>;
    readonly total_tool_calls: number;
    /** 1 when the result carried an error, otherwise 0. */
    readonly errors_encountered: number;
  };
  /** AgentV extension: raw per-evaluator results (extra keys allowed). */
  readonly evaluators?: readonly {
    readonly name: string;
    readonly type: string;
    readonly score: number;
    readonly reasoning: string;
    readonly [key: string]: unknown;
  }[];
  /** AgentV extension: counts parsed from the workspace unified diff. */
  readonly workspace_changes?: {
    readonly files_modified: number;
    readonly files_created: number;
    readonly diff_summary: string;
  };
  /** AgentV extension: present only when the result has a conversation id. */
  readonly conversation?: {
    readonly turns: number;
    readonly conversation_id: string;
  };
}

/** Aggregate duration and token totals across all results (`timing.json`). */
export interface TimingArtifact {
  readonly total_tokens: number;
  readonly duration_ms: number;
  /** `duration_ms` converted to seconds, rounded to 3 decimals. */
  readonly total_duration_seconds: number;
  readonly token_usage: {
    readonly input: number;
    readonly output: number;
  };
}

/** Per-target benchmark statistics (`benchmark.json`). */
export interface BenchmarkArtifact {
  readonly metadata: {
    readonly eval_file: string;
    readonly timestamp: string;
    readonly targets: readonly string[];
    readonly tests_run: readonly string[];
  };
  /** Keyed by target name; mean/stddev over that target's results. */
  readonly run_summary: Record<
    string,
    {
      readonly pass_rate: { readonly mean: number; readonly stddev: number };
      readonly time_seconds: { readonly mean: number; readonly stddev: number };
      readonly tokens: { readonly mean: number; readonly stddev: number };
      readonly tool_calls?: { readonly mean: number; readonly stddev: number };
      readonly cost_usd?: { readonly mean: number; readonly stddev: number };
    }
  >;
  /** Keyed by `name:type` of each evaluator, across all targets. */
  readonly per_evaluator_summary?: Record<
    string,
    { readonly mean: number; readonly stddev: number }
  >;
  readonly notes: readonly string[];
}

// 
--------------------------------------------------------------------------- +// Statistics helpers +// --------------------------------------------------------------------------- + +const PASS_THRESHOLD = 0.8; + +function computeStats(values: readonly number[]): { mean: number; stddev: number } { + if (values.length === 0) { + return { mean: 0, stddev: 0 }; + } + const mean = values.reduce((sum, v) => sum + v, 0) / values.length; + const variance = values.reduce((sum, v) => sum + (v - mean) ** 2, 0) / values.length; + return { + mean: Math.round(mean * 1000) / 1000, + stddev: Math.round(Math.sqrt(variance) * 1000) / 1000, + }; +} + +function computePassRate(result: EvaluationResult): number { + const scores = result.scores; + if (scores && scores.length > 0) { + const passed = scores.filter((s) => s.score >= PASS_THRESHOLD).length; + return passed / scores.length; + } + return result.score >= PASS_THRESHOLD ? 1.0 : 0.0; +} + +// --------------------------------------------------------------------------- +// Tool-call counting from trace data +// --------------------------------------------------------------------------- + +function countToolCalls(result: EvaluationResult): { + toolCalls: Record; + total: number; +} { + const toolCalls: Record = {}; + let total = 0; + + const trace = result.trace as + | { steps?: readonly { toolName?: string; type?: string }[] } + | undefined; + + if (trace?.steps) { + for (const step of trace.steps) { + if (step.toolName || step.type === 'tool') { + const name = step.toolName ?? 'unknown'; + toolCalls[name] = (toolCalls[name] ?? 
0) + 1; + total += 1; + } + } + } + + return { toolCalls, total }; +} + +// --------------------------------------------------------------------------- +// Workspace change parsing from fileChanges diff +// --------------------------------------------------------------------------- + +function parseWorkspaceChanges( + fileChanges: string | undefined, +): GradingArtifact['workspace_changes'] | undefined { + if (!fileChanges) { + return undefined; + } + + let filesModified = 0; + let filesCreated = 0; + + const lines = fileChanges.split('\n'); + for (const line of lines) { + if (line.startsWith('--- /dev/null')) { + filesCreated += 1; + } else if (line.startsWith('--- a/')) { + filesModified += 1; + } + } + + const summaryLines = lines.slice(0, 20); + const diffSummary = + lines.length > 20 + ? `${summaryLines.join('\n')}\n... (${lines.length - 20} more lines)` + : fileChanges; + + return { + files_modified: filesModified, + files_created: filesCreated, + diff_summary: diffSummary, + }; +} + +// --------------------------------------------------------------------------- +// Build expectations from evaluator results (skill-creator compatible) +// --------------------------------------------------------------------------- + +function buildExpectations(result: EvaluationResult): GradingArtifact['expectations'] { + const expectations: { + text: string; + passed: boolean; + evidence: string; + }[] = []; + + if (result.scores && result.scores.length > 0) { + for (const evaluator of result.scores) { + for (const hit of evaluator.hits) { + expectations.push({ + text: hit, + passed: true, + evidence: evaluator.reasoning ?? '', + }); + } + for (const miss of evaluator.misses) { + expectations.push({ + text: miss, + passed: false, + evidence: evaluator.reasoning ?? '', + }); + } + } + } else { + for (const hit of result.hits) { + expectations.push({ text: hit, passed: true, evidence: result.reasoning ?? 
'' }); + } + for (const miss of result.misses) { + expectations.push({ text: miss, passed: false, evidence: result.reasoning ?? '' }); + } + } + + return expectations; +} + +// --------------------------------------------------------------------------- +// Build evaluators list +// --------------------------------------------------------------------------- + +function buildEvaluators( + scores: readonly EvaluatorResult[] | undefined, +): GradingArtifact['evaluators'] { + if (!scores || scores.length === 0) { + return undefined; + } + + return scores.map((s) => ({ + name: s.name, + type: s.type, + score: s.score, + reasoning: s.reasoning ?? '', + weight: s.weight, + verdict: s.verdict, + hits: s.hits, + misses: s.misses, + details: s.details, + })); +} + +// --------------------------------------------------------------------------- +// Public artifact builders +// --------------------------------------------------------------------------- + +export function buildGradingArtifact(result: EvaluationResult): GradingArtifact { + const expectations = buildExpectations(result); + const passed = expectations.filter((e) => e.passed).length; + const failed = expectations.filter((e) => !e.passed).length; + const total = expectations.length; + + const { toolCalls, total: totalToolCalls } = countToolCalls(result); + const errorsEncountered = result.error ? 1 : 0; + + return { + expectations, + summary: { + passed, + failed, + total, + pass_rate: total > 0 ? Math.round((passed / total) * 1000) / 1000 : 0, + }, + execution_metrics: { + tool_calls: toolCalls, + total_tool_calls: totalToolCalls, + errors_encountered: errorsEncountered, + }, + evaluators: buildEvaluators(result.scores), + workspace_changes: parseWorkspaceChanges(result.fileChanges), + conversation: result.conversationId + ? { + turns: result.trace + ? ((result.trace as { steps?: readonly unknown[] }).steps?.length ?? 
0) + : 0, + conversation_id: result.conversationId, + } + : undefined, + }; +} + +export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact { + let totalInput = 0; + let totalOutput = 0; + let totalDurationMs = 0; + + for (const result of results) { + const usage = result.tokenUsage as { input?: number; output?: number } | undefined; + if (usage) { + totalInput += usage.input ?? 0; + totalOutput += usage.output ?? 0; + } + if (result.durationMs != null) { + totalDurationMs += result.durationMs; + } + } + + return { + total_tokens: totalInput + totalOutput, + duration_ms: totalDurationMs, + total_duration_seconds: Math.round((totalDurationMs / 1000) * 1000) / 1000, + token_usage: { + input: totalInput, + output: totalOutput, + }, + }; +} + +export function buildBenchmarkArtifact( + results: readonly EvaluationResult[], + evalFile = '', +): BenchmarkArtifact { + const targetSet = new Set(); + const testIdSet = new Set(); + for (const result of results) { + targetSet.add(result.target); + testIdSet.add(result.testId); + } + + const targets = [...targetSet].sort(); + const testIds = [...testIdSet].sort(); + + const runSummary: BenchmarkArtifact['run_summary'] = {}; + const notes: string[] = []; + + for (const target of targets) { + const targetResults = results.filter((r) => r.target === target); + + const passRates = targetResults.map(computePassRate); + const timings = targetResults + .filter((r) => r.durationMs != null) + .map((r) => (r.durationMs as number) / 1000); + const tokens = targetResults + .filter((r) => r.tokenUsage != null) + .map((r) => { + const usage = r.tokenUsage as { input?: number; output?: number }; + return (usage.input ?? 0) + (usage.output ?? 
0); + }); + + const entry: Record = { + pass_rate: computeStats(passRates), + time_seconds: computeStats(timings), + tokens: computeStats(tokens), + }; + + // Optional tool_calls stats from trace data + const toolCallCounts = targetResults.map((r) => countToolCalls(r).total); + if (toolCallCounts.some((c) => c > 0)) { + entry.tool_calls = computeStats(toolCallCounts); + } + + // Optional cost stats + const costs = targetResults.filter((r) => r.costUsd != null).map((r) => r.costUsd as number); + if (costs.length > 0) { + entry.cost_usd = computeStats(costs); + } + + runSummary[target] = entry as (typeof runSummary)[string]; + } + + // Per-evaluator summary across all results + const evaluatorScores = new Map(); + for (const result of results) { + if (result.scores) { + for (const score of result.scores) { + const key = `${score.name}:${score.type}`; + if (!evaluatorScores.has(key)) { + evaluatorScores.set(key, []); + } + evaluatorScores.get(key)?.push(score.score); + } + } + } + + let perEvaluatorSummary: Record | undefined; + if (evaluatorScores.size > 0) { + perEvaluatorSummary = {}; + for (const [key, scores] of evaluatorScores) { + perEvaluatorSummary[key] = computeStats(scores); + } + } + + const errorCount = results.filter((r) => r.executionStatus === 'execution_error').length; + if (errorCount > 0) { + notes.push( + `${errorCount} test(s) had execution errors and are included in pass_rate as failures`, + ); + } + if (results.length === 0) { + notes.push('No results to summarize'); + } + + const firstResult = results[0]; + const timestamp = firstResult?.timestamp ?? 
new Date().toISOString(); + + return { + metadata: { + eval_file: evalFile, + timestamp, + targets, + tests_run: testIds, + }, + run_summary: runSummary, + per_evaluator_summary: perEvaluatorSummary, + notes, + }; +} + +// --------------------------------------------------------------------------- +// Snake_case to camelCase conversion for reading JSONL files +// --------------------------------------------------------------------------- + +function toCamelCase(str: string): string { + return str.replace(/_([a-z])/g, (_, letter) => letter.toUpperCase()); +} + +function toCamelCaseDeep(obj: unknown): unknown { + if (obj === null || obj === undefined) { + return obj; + } + if (Array.isArray(obj)) { + return obj.map((item) => toCamelCaseDeep(item)); + } + if (typeof obj === 'object') { + const result: Record = {}; + for (const [key, value] of Object.entries(obj)) { + result[toCamelCase(key)] = toCamelCaseDeep(value); + } + return result; + } + return obj; +} + +// --------------------------------------------------------------------------- +// JSONL parsing +// --------------------------------------------------------------------------- + +export function parseJsonlResults(content: string): EvaluationResult[] { + const results: EvaluationResult[] = []; + const lines = content.split('\n'); + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.length === 0) { + continue; + } + try { + const parsed = JSON.parse(trimmed); + // JSONL files from AgentV use snake_case; convert back to camelCase + const camelCased = toCamelCaseDeep(parsed); + results.push(camelCased as EvaluationResult); + } catch { + // Skip malformed lines + } + } + return results; +} + +// --------------------------------------------------------------------------- +// Artifact writer — reads JSONL and writes all three artifact types +// --------------------------------------------------------------------------- + +export async function writeArtifacts( + jsonlPath: string, + outputDir: 
string, + options?: { evalFile?: string }, +): Promise<{ gradingDir: string; timingPath: string; benchmarkPath: string }> { + const content = await readFile(jsonlPath, 'utf8'); + const results = parseJsonlResults(content); + + return writeArtifactsFromResults(results, outputDir, options); +} + +export async function writeArtifactsFromResults( + results: readonly EvaluationResult[], + outputDir: string, + options?: { evalFile?: string }, +): Promise<{ gradingDir: string; timingPath: string; benchmarkPath: string }> { + const gradingDir = path.join(outputDir, 'grading'); + const timingPath = path.join(outputDir, 'timing.json'); + const benchmarkPath = path.join(outputDir, 'benchmark.json'); + + await mkdir(gradingDir, { recursive: true }); + + // Write per-test grading artifacts + for (const result of results) { + const grading = buildGradingArtifact(result); + const safeTestId = result.testId.replace(/[/\\:*?"<>|]/g, '_'); + const gradingPath = path.join(gradingDir, `${safeTestId}.json`); + await writeFile(gradingPath, `${JSON.stringify(grading, null, 2)}\n`, 'utf8'); + } + + // Write aggregate timing + const timing = buildTimingArtifact(results); + await writeFile(timingPath, `${JSON.stringify(timing, null, 2)}\n`, 'utf8'); + + // Write benchmark + const benchmark = buildBenchmarkArtifact(results, options?.evalFile); + await writeFile(benchmarkPath, `${JSON.stringify(benchmark, null, 2)}\n`, 'utf8'); + + return { gradingDir, timingPath, benchmarkPath }; +} diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index 3b1357b94..da8d39fc9 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -157,6 +157,12 @@ export const evalRunCommand = command({ long: 'benchmark-json', description: 'Write Agent Skills benchmark.json to the specified path', }), + artifacts: option({ + type: optional(string), + long: 'artifacts', + description: + 'Write companion artifacts 
(grading/.json, timing.json, benchmark.json) to the specified directory', + }), }, handler: async (args) => { // Launch interactive wizard when no eval paths and stdin is a TTY @@ -196,6 +202,7 @@ export const evalRunCommand = command({ retryErrors: args.retryErrors, strict: args.strict, benchmarkJson: args.benchmarkJson, + artifacts: args.artifacts, }; await runEvalCommand({ testFiles: resolvedPaths, rawOptions }); }, diff --git a/apps/cli/src/commands/eval/run-eval.ts b/apps/cli/src/commands/eval/run-eval.ts index c73779f94..43eec380b 100644 --- a/apps/cli/src/commands/eval/run-eval.ts +++ b/apps/cli/src/commands/eval/run-eval.ts @@ -26,6 +26,7 @@ import { } from '@agentv/core'; import { enforceRequiredVersion } from '../../version-check.js'; +import { writeArtifactsFromResults } from './artifact-writer.js'; import { writeBenchmarkJson } from './benchmark-writer.js'; import { loadEnvFromHierarchy } from './env.js'; import { @@ -80,6 +81,7 @@ interface NormalizedOptions { readonly workspaceMode?: 'pooled' | 'temp' | 'static'; readonly workspacePath?: string; readonly benchmarkJson?: string; + readonly artifacts?: string; } function normalizeBoolean(value: unknown): boolean { @@ -246,6 +248,7 @@ function normalizeOptions( workspaceMode, workspacePath, benchmarkJson: normalizeString(rawOptions.benchmarkJson), + artifacts: normalizeString(rawOptions.artifacts), } satisfies NormalizedOptions; } @@ -1046,6 +1049,21 @@ export async function runEvalCommand(input: RunEvalCommandInput): Promise console.log(`Benchmark written to: ${benchmarkPath}`); } + // Write companion artifacts (grading, timing, benchmark) if requested + if (options.artifacts && allResults.length > 0) { + const artifactsDir = path.resolve(options.artifacts); + const evalFile = resolvedTestFiles.length === 1 ? 
resolvedTestFiles[0] : ''; + const { + gradingDir, + timingPath, + benchmarkPath: abp, + } = await writeArtifactsFromResults(allResults, artifactsDir, { evalFile }); + console.log(`Artifacts written to: ${artifactsDir}`); + console.log(` Grading: ${gradingDir} (${allResults.length} files)`); + console.log(` Timing: ${timingPath}`); + console.log(` Benchmark: ${abp}`); + } + // Print workspace paths for failed cases (when preserved for debugging) const failedWithWorkspaces = allResults.filter( (r) => r.workspacePath && (r.error || r.score < 0.5), diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts new file mode 100644 index 000000000..0d46e3d63 --- /dev/null +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -0,0 +1,507 @@ +import { afterEach, beforeEach, describe, expect, it } from 'bun:test'; +import { readFile, readdir, rm } from 'node:fs/promises'; +import path from 'node:path'; + +import type { EvaluationResult, EvaluatorResult } from '@agentv/core'; + +import { + type BenchmarkArtifact, + type GradingArtifact, + type TimingArtifact, + buildBenchmarkArtifact, + buildGradingArtifact, + buildTimingArtifact, + parseJsonlResults, + writeArtifacts, + writeArtifactsFromResults, +} from '../../../src/commands/eval/artifact-writer.js'; + +function makeResult(overrides: Partial = {}): EvaluationResult { + return { + timestamp: '2026-03-13T00:00:00.000Z', + testId: 'test-1', + score: 0.9, + hits: ['criterion-1'], + misses: [], + answer: 'test answer', + target: 'test-target', + executionStatus: 'ok', + ...overrides, + } as EvaluationResult; +} + +function makeEvaluatorResult(overrides: Partial = {}): EvaluatorResult { + return { + name: 'judge-1', + type: 'llm-judge', + score: 0.85, + hits: ['criterion-a'], + misses: ['criterion-b'], + reasoning: 'Good output overall', + ...overrides, + } as EvaluatorResult; +} + +// --------------------------------------------------------------------------- +// 
Grading artifact
// ---------------------------------------------------------------------------

describe('buildGradingArtifact', () => {
  it('maps evaluator hits/misses to expectations', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({
          hits: ['correct format', 'has code'],
          misses: ['missing tests'],
          reasoning: 'Output was formatted well',
        }),
      ],
    });

    const grading = buildGradingArtifact(result);

    // Hits come first (passed), then misses (failed); all share the
    // evaluator's reasoning as evidence.
    expect(grading.expectations).toHaveLength(3);
    expect(grading.expectations[0]).toEqual({
      text: 'correct format',
      passed: true,
      evidence: 'Output was formatted well',
    });
    expect(grading.expectations[1]).toEqual({
      text: 'has code',
      passed: true,
      evidence: 'Output was formatted well',
    });
    expect(grading.expectations[2]).toEqual({
      text: 'missing tests',
      passed: false,
      evidence: 'Output was formatted well',
    });
  });

  it('computes correct summary', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({
          hits: ['a', 'b'],
          misses: ['c'],
        }),
      ],
    });

    const grading = buildGradingArtifact(result);

    // 2/3 rounded to 3 decimals
    expect(grading.summary).toEqual({
      passed: 2,
      failed: 1,
      total: 3,
      pass_rate: 0.667,
    });
  });

  it('falls back to top-level hits/misses when no evaluator scores', () => {
    const result = makeResult({
      hits: ['ok-1', 'ok-2'],
      misses: ['miss-1'],
      reasoning: 'top-level reasoning',
    });

    const grading = buildGradingArtifact(result);

    expect(grading.expectations).toHaveLength(3);
    expect(grading.expectations[0].text).toBe('ok-1');
    expect(grading.expectations[0].evidence).toBe('top-level reasoning');
    expect(grading.expectations[2].text).toBe('miss-1');
    expect(grading.expectations[2].passed).toBe(false);
  });

  it('includes evaluators list with AgentV extensions', () => {
    const result = makeResult({
      scores: [
        makeEvaluatorResult({ name: 'format-check', type: 'code-judge', score: 1.0 }),
        makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.7 }),
      ],
    });

    const grading = buildGradingArtifact(result);

    expect(grading.evaluators).toHaveLength(2);
    expect(grading.evaluators?.[0].name).toBe('format-check');
    expect(grading.evaluators?.[0].type).toBe('code-judge');
    expect(grading.evaluators?.[1].score).toBe(0.7);
  });

  it('records error as errors_encountered', () => {
    const result = makeResult({ error: 'Timeout exceeded' });
    const grading = buildGradingArtifact(result);
    expect(grading.execution_metrics.errors_encountered).toBe(1);
  });

  it('handles result with no hits, misses, or scores', () => {
    const result = makeResult({ hits: [], misses: [], scores: undefined });
    const grading = buildGradingArtifact(result);

    expect(grading.expectations).toHaveLength(0);
    expect(grading.summary).toEqual({
      passed: 0,
      failed: 0,
      total: 0,
      pass_rate: 0,
    });
    expect(grading.evaluators).toBeUndefined();
  });

  it('includes workspace_changes when fileChanges present', () => {
    const diff = [
      '--- /dev/null',
      '+++ b/new-file.ts',
      '@@ -0,0 +1 @@',
      '+console.log("hello")',
      '--- a/existing.ts',
      '+++ b/existing.ts',
      '@@ -1 +1 @@',
      '-old',
      '+new',
    ].join('\n');

    const result = makeResult({ fileChanges: diff });
    const grading = buildGradingArtifact(result);

    expect(grading.workspace_changes).toBeDefined();
    expect(grading.workspace_changes?.files_created).toBe(1);
    expect(grading.workspace_changes?.files_modified).toBe(1);
  });

  it('includes conversation when conversationId present', () => {
    const result = makeResult({ conversationId: 'conv-abc-123' });
    const grading = buildGradingArtifact(result);

    expect(grading.conversation).toBeDefined();
    expect(grading.conversation?.conversation_id).toBe('conv-abc-123');
  });
});

// ---------------------------------------------------------------------------
// Timing artifact
// ---------------------------------------------------------------------------

describe('buildTimingArtifact', () => {
  it('aggregates timing across results', () => {
    const results = [
      makeResult({
        durationMs: 30000,
        tokenUsage: { input: 1000, output: 500 },
      } as Partial<EvaluationResult>),
      makeResult({
        durationMs: 60000,
        tokenUsage: { input: 2000, output: 1000 },
      } as Partial<EvaluationResult>),
    ];

    const timing = buildTimingArtifact(results);

    expect(timing.total_tokens).toBe(4500);
    expect(timing.duration_ms).toBe(90000);
    expect(timing.total_duration_seconds).toBe(90);
    expect(timing.token_usage).toEqual({ input: 3000, output: 1500 });
  });

  it('handles results with no timing data', () => {
    const results = [makeResult({})];
    const timing = buildTimingArtifact(results);

    expect(timing.total_tokens).toBe(0);
    expect(timing.duration_ms).toBe(0);
    expect(timing.total_duration_seconds).toBe(0);
    expect(timing.token_usage).toEqual({ input: 0, output: 0 });
  });

  it('handles empty results array', () => {
    const timing = buildTimingArtifact([]);

    expect(timing.total_tokens).toBe(0);
    expect(timing.duration_ms).toBe(0);
    expect(timing.total_duration_seconds).toBe(0);
  });

  it('handles partial token usage', () => {
    const results = [
      makeResult({
        tokenUsage: { input: 500 },
      } as Partial<EvaluationResult>),
    ];

    const timing = buildTimingArtifact(results);
    expect(timing.total_tokens).toBe(500);
    expect(timing.token_usage).toEqual({ input: 500, output: 0 });
  });
});

// ---------------------------------------------------------------------------
// Benchmark artifact
// ---------------------------------------------------------------------------

describe('buildBenchmarkArtifact', () => {
  it('computes per-target statistics', () => {
    const results = [
      makeResult({ target: 'gpt-4', score: 0.9, durationMs: 30000 }),
      makeResult({ target: 'gpt-4', testId: 'test-2', score: 0.8, durationMs: 60000 }),
      makeResult({ target: 'claude', score: 0.5, durationMs: 45000 }),
    ];

    const benchmark = buildBenchmarkArtifact(results, 'test.eval.yaml');

    expect(benchmark.metadata.eval_file).toBe('test.eval.yaml');
    expect(benchmark.metadata.targets).toEqual(['claude', 'gpt-4']);
    expect(benchmark.metadata.tests_run).toEqual(['test-1', 'test-2']);

    // gpt-4: both pass (>= 0.8), pass_rate mean = 1.0
    expect(benchmark.run_summary['gpt-4'].pass_rate.mean).toBe(1);
    // claude: 0.5 < 0.8 → 0.0, pass_rate mean = 0.0
    expect(benchmark.run_summary.claude.pass_rate.mean).toBe(0);

    // gpt-4: (30+60)/2 = 45 seconds; population stddev of {30, 60} = 15
    expect(benchmark.run_summary['gpt-4'].time_seconds.mean).toBe(45);
    expect(benchmark.run_summary['gpt-4'].time_seconds.stddev).toBe(15);
  });

  it('includes per-evaluator summary', () => {
    const results = [
      makeResult({
        scores: [makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.9 })],
      }),
      makeResult({
        testId: 'test-2',
        scores: [makeEvaluatorResult({ name: 'quality', type: 'llm-judge', score: 0.7 })],
      }),
    ];

    const benchmark = buildBenchmarkArtifact(results);

    expect(benchmark.per_evaluator_summary).toBeDefined();
    expect(benchmark.per_evaluator_summary?.['quality:llm-judge'].mean).toBe(0.8);
  });

  it('adds note when execution errors present', () => {
    const results = [makeResult({ executionStatus: 'execution_error', score: 0 })];

    const benchmark = buildBenchmarkArtifact(results);
    expect(benchmark.notes.some((n) => n.includes('execution errors'))).toBe(true);
  });

  it('handles empty results', () => {
    const benchmark = buildBenchmarkArtifact([]);

    expect(benchmark.metadata.targets).toEqual([]);
    expect(benchmark.metadata.tests_run).toEqual([]);
    expect(benchmark.notes).toContain('No results to summarize');
  });

  it('includes cost_usd when available', () => {
    const results = [makeResult({ costUsd: 0.05 }), makeResult({ testId: 'test-2', costUsd: 0.1 })];

    const benchmark = buildBenchmarkArtifact(results);
    const summary = benchmark.run_summary['test-target'];
    expect(summary.cost_usd).toBeDefined();
    expect(summary.cost_usd?.mean).toBe(0.075);
  });
});

// ---------------------------------------------------------------------------
// JSONL parsing
// ---------------------------------------------------------------------------

describe('parseJsonlResults', () => {
  it('parses multi-line JSONL', () => {
    const line1 = JSON.stringify({ testId: 'a', score: 0.9 });
    const line2 = JSON.stringify({ testId: 'b', score: 0.5 });
    const content = `${line1}\n${line2}\n`;

    const results = parseJsonlResults(content);
    expect(results).toHaveLength(2);
    expect(results[0].testId).toBe('a');
    expect(results[1].testId).toBe('b');
  });

  it('handles empty content', () => {
    expect(parseJsonlResults('')).toHaveLength(0);
  });

  it('skips blank lines', () => {
    const line = JSON.stringify({ testId: 'a', score: 0.9 });
    const content = `\n${line}\n\n`;
    expect(parseJsonlResults(content)).toHaveLength(1);
  });

  it('skips malformed lines', () => {
    const good = JSON.stringify({ testId: 'a', score: 0.9 });
    const content = `${good}\nnot json\n`;
    expect(parseJsonlResults(content)).toHaveLength(1);
  });
});

// ---------------------------------------------------------------------------
// Schema compatibility (shared fields match skill-creator format)
// ---------------------------------------------------------------------------

describe('schema compatibility', () => {
  it('grading expectations have text/passed/evidence fields', () => {
    const result = makeResult({
      scores: [makeEvaluatorResult({ hits: ['x'], misses: ['y'], reasoning: 'r' })],
    });
    const grading = buildGradingArtifact(result);

    for (const exp of grading.expectations) {
      expect(exp).toHaveProperty('text');
      expect(exp).toHaveProperty('passed');
      expect(exp).toHaveProperty('evidence');
      expect(typeof exp.text).toBe('string');
      expect(typeof exp.passed).toBe('boolean');
      expect(typeof exp.evidence).toBe('string');
    }
  });

  it('grading summary has passed/failed/total/pass_rate', () => {
    const result = makeResult({
      scores: [makeEvaluatorResult({ hits: ['a'], misses: [] })],
    });
    const grading = buildGradingArtifact(result);

    expect(grading.summary).toHaveProperty('passed');
    expect(grading.summary).toHaveProperty('failed');
    expect(grading.summary).toHaveProperty('total');
    expect(grading.summary).toHaveProperty('pass_rate');
    expect(typeof grading.summary.pass_rate).toBe('number');
  });

  it('timing has total_tokens, duration_ms, total_duration_seconds, token_usage', () => {
    const timing = buildTimingArtifact([makeResult({})]);

    expect(timing).toHaveProperty('total_tokens');
    expect(timing).toHaveProperty('duration_ms');
    expect(timing).toHaveProperty('total_duration_seconds');
    expect(timing).toHaveProperty('token_usage');
    expect(timing.token_usage).toHaveProperty('input');
    expect(timing.token_usage).toHaveProperty('output');
  });

  it('benchmark run_summary has pass_rate/time_seconds/tokens with mean/stddev', () => {
    const benchmark = buildBenchmarkArtifact([makeResult({})]);
    const summary = benchmark.run_summary['test-target'];

    expect(summary).toBeDefined();
    expect(summary.pass_rate).toHaveProperty('mean');
    expect(summary.pass_rate).toHaveProperty('stddev');
    expect(summary.time_seconds).toHaveProperty('mean');
    expect(summary.time_seconds).toHaveProperty('stddev');
    expect(summary.tokens).toHaveProperty('mean');
    expect(summary.tokens).toHaveProperty('stddev');
  });
});

// ---------------------------------------------------------------------------
// File I/O: writeArtifacts / writeArtifactsFromResults
// ---------------------------------------------------------------------------

describe('writeArtifactsFromResults', () => {
  const testDir = path.join(import.meta.dir, '.test-artifact-output');

  beforeEach(() => {
    // Clean before each test to ensure isolation
  });

  afterEach(async () => {
    await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
  });

  it('writes grading, timing, and benchmark files', async () => {
    const results = [
      makeResult({ testId: 'alpha', score: 0.9, durationMs: 5000 }),
      makeResult({ testId: 'beta', score: 0.6, durationMs: 8000 }),
    ];

    const paths = await writeArtifactsFromResults(results, testDir, {
      evalFile: 'my-eval.yaml',
    });

    // Check grading files
    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles.sort()).toEqual(['alpha.json', 'beta.json']);

    const alphaGrading: GradingArtifact = JSON.parse(
      await readFile(path.join(paths.gradingDir, 'alpha.json'), 'utf8'),
    );
    expect(alphaGrading.summary).toBeDefined();
    expect(alphaGrading.execution_metrics).toBeDefined();

    // Check timing
    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.duration_ms).toBe(13000);

    // Check benchmark
    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
    expect(benchmark.metadata.eval_file).toBe('my-eval.yaml');
    expect(benchmark.metadata.tests_run.sort()).toEqual(['alpha', 'beta']);
  });

  it('handles empty results array', async () => {
    const paths = await writeArtifactsFromResults([], testDir);

    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles).toHaveLength(0);

    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.total_tokens).toBe(0);

    const benchmark: BenchmarkArtifact = JSON.parse(await readFile(paths.benchmarkPath, 'utf8'));
    expect(benchmark.notes).toContain('No results to summarize');
  });

  it('sanitizes test IDs for filenames', async () => {
    const results = [makeResult({ testId: 'path/to:test*1' })];
    await writeArtifactsFromResults(results, testDir);

    const gradingFiles = await readdir(path.join(testDir, 'grading'));
    expect(gradingFiles).toEqual(['path_to_test_1.json']);
  });
});

describe('writeArtifacts (from JSONL file)', () => {
  const testDir = path.join(import.meta.dir, '.test-artifact-jsonl');
  const jsonlPath = path.join(testDir, 'results.jsonl');

  beforeEach(async () => {
    const { mkdir, writeFile } = await import('node:fs/promises');
    await mkdir(testDir, { recursive: true });
    // Snake_case keys mirror what the CLI's JSONL output actually contains.
    const lines = [
      JSON.stringify({
        timestamp: '2026-01-01T00:00:00Z',
        test_id: 'from-file',
        score: 0.85,
        hits: ['pass-1'],
        misses: [],
        answer: 'file answer',
        target: 'default',
        execution_status: 'ok',
        duration_ms: 12000,
        token_usage: { input: 500, output: 200 },
      }),
    ];
    await writeFile(jsonlPath, `${lines.join('\n')}\n`, 'utf8');
  });

  afterEach(async () => {
    await rm(testDir, { recursive: true, force: true }).catch(() => undefined);
  });

  it('reads JSONL and produces artifacts', async () => {
    const outputDir = path.join(testDir, 'out');
    const paths = await writeArtifacts(jsonlPath, outputDir);

    const gradingFiles = await readdir(paths.gradingDir);
    expect(gradingFiles).toHaveLength(1);

    const timing: TimingArtifact = JSON.parse(await readFile(paths.timingPath, 'utf8'));
    expect(timing.duration_ms).toBe(12000);
    expect(timing.total_tokens).toBe(700);
  });
});