From ff83a052cf6a807f00ea5ba5dd94d3dcb38ab253 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 04:23:57 +0000 Subject: [PATCH] =?UTF-8?q?feat(cli):=20EVAL.yaml=20=E2=86=92=20evals.json?= =?UTF-8?q?=20transpiler=20(agentv=20transpile)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a transpiler that converts AgentV EVAL.yaml format to Agent Skills evals.json format for consumption by the skill-creator pipeline. - New core module: eval-yaml-transpiler (transpileEvalYaml, transpileEvalYamlFile, getOutputFilenames) - New CLI command: agentv transpile [--out-dir dir] [--stdout] - Handles both assertions: (current) and assert: (deprecated alias) - Extracts prompt and files from structured content blocks (type: text/file) - Converts all assertion types to natural language strings - trigger-judge maps to should_trigger bool, not assertions - Root-level assertions distributed to every test - Multi-skill input produces one evals.json per skill - Tests without trigger-judge assigned to dominant skill or _no-skill - 43 unit tests covering all field mappings and edge cases - Example EVAL.yaml and expected evals.json in examples/features/transpile/ --- apps/cli/src/commands/transpile/index.ts | 69 ++ apps/cli/src/index.ts | 2 + .../features/transpile/csv-analyzer.EVAL.yaml | 26 + .../transpile/csv-analyzer.evals.json | 23 + .../loaders/eval-yaml-transpiler.ts | 525 ++++++++++++++ packages/core/src/index.ts | 10 + .../loaders/eval-yaml-transpiler.test.ts | 672 ++++++++++++++++++ 7 files changed, 1327 insertions(+) create mode 100644 apps/cli/src/commands/transpile/index.ts create mode 100644 examples/features/transpile/csv-analyzer.EVAL.yaml create mode 100644 examples/features/transpile/csv-analyzer.evals.json create mode 100644 packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts create mode 100644 packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts diff --git 
a/apps/cli/src/commands/transpile/index.ts b/apps/cli/src/commands/transpile/index.ts new file mode 100644 index 000000000..82c6ec663 --- /dev/null +++ b/apps/cli/src/commands/transpile/index.ts @@ -0,0 +1,69 @@ +import { writeFileSync } from 'node:fs'; +import path from 'node:path'; +import { command, flag, option, optional, positional, string } from 'cmd-ts'; + +import { getOutputFilenames, transpileEvalYamlFile } from '@agentv/core'; + +export const transpileCommand = command({ + name: 'transpile', + description: 'Convert an EVAL.yaml file to Agent Skills evals.json format', + args: { + input: positional({ + type: string, + displayName: 'input', + description: 'Path to EVAL.yaml file', + }), + outDir: option({ + type: optional(string), + long: 'out-dir', + short: 'd', + description: 'Output directory (defaults to directory of input file)', + }), + stdout: flag({ + long: 'stdout', + description: 'Write to stdout instead of file(s) (only valid for single-skill output)', + }), + }, + handler: async ({ input, outDir, stdout }) => { + let result: ReturnType<typeof transpileEvalYamlFile>; + try { + result = transpileEvalYamlFile(path.resolve(input)); + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + + // Print warnings + for (const warning of result.warnings) { + console.warn(`Warning: ${warning}`); + } + + if (result.files.size === 0) { + console.error('Error: No output produced (no tests found)'); + process.exit(1); + } + + if (stdout) { + if (result.files.size > 1) { + console.error( + 'Error: --stdout is only valid when input produces a single evals.json (multi-skill input produces multiple files)', + ); + process.exit(1); + } + const [file] = result.files.values(); + process.stdout.write(JSON.stringify(file, null, 2)); + process.stdout.write('\n'); + return; + } + + const outputDir = outDir ? 
path.resolve(outDir) : path.dirname(path.resolve(input)); + const fileNames = getOutputFilenames(result); + + for (const [skill, evalsJson] of result.files) { + const fileName = fileNames.get(skill) ?? 'evals.json'; + const outputPath = path.join(outputDir, fileName); + writeFileSync(outputPath, `${JSON.stringify(evalsJson, null, 2)}\n`); + console.log(`Transpiled to ${outputPath}`); + } + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index dd9d7690b..c1ae9a2aa 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -10,6 +10,7 @@ import { generateCommand } from './commands/generate/index.js'; import { initCmdTsCommand } from './commands/init/index.js'; import { selfCommand } from './commands/self/index.js'; import { traceCommand } from './commands/trace/index.js'; +import { transpileCommand } from './commands/transpile/index.js'; import { trimCommand } from './commands/trim/index.js'; import { validateCommand } from './commands/validate/index.js'; import { workspaceCommand } from './commands/workspace/index.js'; @@ -29,6 +30,7 @@ export const app = subcommands({ init: initCmdTsCommand, self: selfCommand, trace: traceCommand, + transpile: transpileCommand, trim: trimCommand, validate: validateCommand, workspace: workspaceCommand, diff --git a/examples/features/transpile/csv-analyzer.EVAL.yaml b/examples/features/transpile/csv-analyzer.EVAL.yaml new file mode 100644 index 000000000..8473efcbc --- /dev/null +++ b/examples/features/transpile/csv-analyzer.EVAL.yaml @@ -0,0 +1,26 @@ +tests: + - id: csv-top-months + criteria: Agent finds the top 3 months by revenue + input: + - role: user + content: + - type: file + value: evals/files/sales.csv + - type: text + value: "I have a CSV of monthly sales data. Find the top 3 months by revenue." + expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)." 
+ assertions: + - type: trigger-judge + skill: csv-analyzer + should_trigger: true + - type: rubrics + criteria: "Output identifies November as the highest revenue month" + - type: contains + value: "$22,500" + + - id: irrelevant-query + input: "What time is it?" + assertions: + - type: trigger-judge + skill: csv-analyzer + should_trigger: false diff --git a/examples/features/transpile/csv-analyzer.evals.json b/examples/features/transpile/csv-analyzer.evals.json new file mode 100644 index 000000000..f843813f9 --- /dev/null +++ b/examples/features/transpile/csv-analyzer.evals.json @@ -0,0 +1,23 @@ +{ + "skill_name": "csv-analyzer", + "evals": [ + { + "id": 1, + "prompt": "I have a CSV of monthly sales data. Find the top 3 months by revenue.", + "expected_output": "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).", + "files": ["evals/files/sales.csv"], + "should_trigger": true, + "assertions": [ + "Agent finds the top 3 months by revenue", + "Output identifies November as the highest revenue month", + "Output contains '$22,500'" + ] + }, + { + "id": 2, + "prompt": "What time is it?", + "should_trigger": false, + "assertions": [] + } + ] +} diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts new file mode 100644 index 000000000..e170b0ed6 --- /dev/null +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -0,0 +1,525 @@ +/** + * EVAL.yaml → evals.json transpiler. + * + * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format + * for consumption by the skill-creator pipeline. + * + * Handles both `assertions:` (current) and `assert:` (deprecated alias). 
+ */ + +import { readFileSync } from 'node:fs'; +import path from 'node:path'; +import { parse } from 'yaml'; + +// --------------------------------------------------------------------------- +// evals.json output types +// --------------------------------------------------------------------------- + +export interface EvalsJsonCase { + id: number; + prompt: string; + expected_output?: string; + files?: string[]; + should_trigger?: boolean; + assertions: string[]; +} + +export interface EvalsJsonFile { + skill_name: string; + evals: EvalsJsonCase[]; +} + +// --------------------------------------------------------------------------- +// Raw YAML input types (unvalidated) +// --------------------------------------------------------------------------- + +type RawContent = + | string + | Array<{ type?: string; value?: string; [key: string]: unknown }> + | unknown; + +interface RawMessage { + role?: string; + content?: RawContent; + [key: string]: unknown; +} + +interface RawAssertEntry { + type?: string; + skill?: string; + should_trigger?: boolean; + criteria?: string; + value?: string; + name?: string; + description?: string; + command?: unknown; + prompt?: string; + rubrics?: unknown[]; + expected?: unknown[]; + fields?: unknown[]; + threshold?: number; + budget?: number; + [key: string]: unknown; +} + +interface RawTestCase { + id?: string | number; + criteria?: string; + input?: string | RawMessage[] | { [key: string]: unknown }; + input_files?: string[]; + expected_output?: string | RawMessage[] | unknown; + assertions?: RawAssertEntry[]; + /** @deprecated Use `assertions` instead */ + assert?: RawAssertEntry[]; + [key: string]: unknown; +} + +interface RawSuite { + tests?: RawTestCase[]; + assertions?: RawAssertEntry[]; + /** @deprecated Use `assertions` instead */ + assert?: RawAssertEntry[]; + [key: string]: unknown; +} + +// --------------------------------------------------------------------------- +// Assertion → natural language conversion +// 
--------------------------------------------------------------------------- + +function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { + const type = entry.type; + + switch (type) { + case 'trigger-judge': + // Handled separately — not an NL assertion + return null; + + case 'rubrics': { + // criteria may be a string (NL) or array of rubric items + if (typeof entry.criteria === 'string') { + return entry.criteria; + } + return null; + } + + case 'contains': + return `Output contains '${entry.value}'`; + + case 'contains-any': + case 'contains_any': { + const values = Array.isArray(entry.value) + ? (entry.value as string[]).join("', '") + : entry.value; + return `Output contains any of: '${values}'`; + } + + case 'contains-all': + case 'contains_all': { + const values = Array.isArray(entry.value) + ? (entry.value as string[]).join("', '") + : entry.value; + return `Output contains all of: '${values}'`; + } + + case 'icontains': + return `Output contains (case-insensitive) '${entry.value}'`; + + case 'regex': + return `Output matches regex: ${entry.value}`; + + case 'equals': + return `Output exactly equals: ${entry.value}`; + + case 'is-json': + case 'is_json': + return 'Output is valid JSON'; + + case 'starts-with': + case 'starts_with': + return `Output starts with '${entry.value}'`; + + case 'ends-with': + case 'ends_with': + return `Output ends with '${entry.value}'`; + + case 'llm-judge': + case 'llm_judge': + return typeof entry.prompt === 'string' ? entry.prompt : null; + + case 'agent-judge': + case 'agent_judge': { + // Expand each rubric item to its own assertion string + // Return the first one — callers handle arrays via assertionToNaturalLanguageList + if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { + return null; // handled by list expansion below + } + return typeof entry.prompt === 'string' ? 
entry.prompt : null; + } + + case 'tool-trajectory': + case 'tool_trajectory': { + const expectedArr = Array.isArray(entry.expected) ? entry.expected : []; + const tools = (expectedArr as Array<{ tool?: string }>) + .map((e) => e.tool) + .filter(Boolean) + .join(', '); + return tools + ? `Agent called tools in order: ${tools}` + : 'Agent followed expected tool trajectory'; + } + + case 'code-judge': + case 'code_judge': { + const namePart = + entry.name ?? (Array.isArray(entry.command) ? entry.command.join(' ') : entry.command); + const descPart = typeof entry.description === 'string' ? `: ${entry.description}` : ''; + return namePart ? `${namePart}${descPart}` : 'Code judge assertion'; + } + + case 'field-accuracy': + case 'field_accuracy': { + const fieldPaths = Array.isArray(entry.fields) + ? (entry.fields as Array<{ path?: string }>) + .map((f) => f.path) + .filter(Boolean) + .join(', ') + : ''; + return fieldPaths + ? `Fields ${fieldPaths} match expected values` + : 'Fields match expected values'; + } + + case 'latency': + return typeof entry.threshold === 'number' + ? `Response time under ${entry.threshold}ms` + : 'Response time within threshold'; + + case 'cost': + return typeof entry.budget === 'number' + ? `Cost under $${entry.budget}` + : 'Cost within budget'; + + case 'token-usage': + case 'token_usage': + return 'Token usage within limits'; + + case 'execution-metrics': + case 'execution_metrics': + return 'Execution within metric bounds'; + + default: + // Unknown type: try to produce something readable + if (typeof entry.criteria === 'string') return entry.criteria; + if (typeof entry.prompt === 'string') return entry.prompt; + return type ? `${type} assertion` : null; + } +} + +/** + * Expand a single assertion entry into zero or more NL strings. + * Most assertions produce exactly one string; agent-judge with rubrics expands to many. 
+ */ +function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { + if (entry.type === 'agent-judge' || entry.type === 'agent_judge') { + if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { + return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) + .map((r) => r.outcome ?? r.criteria ?? r.id) + .filter((s): s is string => typeof s === 'string'); + } + } + const nl = assertionToNaturalLanguage(entry); + return nl !== null ? [nl] : []; +} + +/** + * Extract trigger-judge entries from an assertion list. + * Returns entries with type === 'trigger-judge'. + */ +function extractTriggerJudges(assertions: RawAssertEntry[]): RawAssertEntry[] { + return assertions.filter((a) => a.type === 'trigger-judge'); +} + +/** + * Collect all assertion entries for a test case, accepting both + * `assertions` and deprecated `assert` key. + */ +function resolveAssertions(rawCase: RawTestCase): RawAssertEntry[] { + if (Array.isArray(rawCase.assertions)) return rawCase.assertions; + if (Array.isArray(rawCase.assert)) return rawCase.assert; + return []; +} + +/** + * Collect suite-level assertions (applied to every test). + */ +function resolveSuiteAssertions(suite: RawSuite): RawAssertEntry[] { + if (Array.isArray(suite.assertions)) return suite.assertions; + if (Array.isArray(suite.assert)) return suite.assert; + return []; +} + +// --------------------------------------------------------------------------- +// Input extraction +// --------------------------------------------------------------------------- + +interface ExtractedInput { + prompt: string; + files: string[]; +} + +/** + * Extract prompt text and file paths from a test case input. 
+ * + * Supports: + * - String input → prompt, no files + * - Message array with role: user and content blocks + * - input_files shorthand (alongside string or message-array input) + */ +function extractInput(rawCase: RawTestCase): ExtractedInput { + const inputFiles: string[] = []; + + // Collect input_files shorthand + if (Array.isArray(rawCase.input_files)) { + inputFiles.push(...(rawCase.input_files as string[]).filter((f) => typeof f === 'string')); + } + + const input = rawCase.input; + + // String shorthand + if (typeof input === 'string') { + return { prompt: input, files: inputFiles }; + } + + // Message array + if (Array.isArray(input)) { + let promptText = ''; + const filePaths: string[] = [...inputFiles]; + + for (const msg of input as RawMessage[]) { + if (msg.role !== 'user') continue; + + // String content + if (typeof msg.content === 'string') { + promptText = msg.content; + continue; + } + + // Content block array + if (Array.isArray(msg.content)) { + for (const block of msg.content as Array<{ type?: string; value?: string }>) { + if (block.type === 'text' && typeof block.value === 'string') { + promptText = block.value; + } else if (block.type === 'file' && typeof block.value === 'string') { + filePaths.push(block.value); + } + } + } + } + + return { prompt: promptText, files: filePaths }; + } + + return { prompt: '', files: inputFiles }; +} + +/** + * Flatten expected_output to a string. + * Accepts string, message array (takes last assistant message content), + * or any other value serialized to JSON. 
 + */ +function extractExpectedOutput(raw: unknown): string | undefined { + if (raw === undefined || raw === null) return undefined; + if (typeof raw === 'string') return raw; + + if (Array.isArray(raw)) { + // Take the last assistant message content + for (let i = raw.length - 1; i >= 0; i--) { + const msg = raw[i] as RawMessage; + if (typeof msg.content === 'string') return msg.content; + } + return undefined; + } + + return JSON.stringify(raw); +} + +// --------------------------------------------------------------------------- +// Transpiler core +// --------------------------------------------------------------------------- + +/** + * Result of transpiling a single EVAL.yaml. + * May produce multiple evals.json files (one per skill). + */ +export interface TranspileResult { + /** Map from skill_name → EvalsJsonFile */ + files: Map<string, EvalsJsonFile>; + /** Warning messages accumulated during transpilation */ + warnings: string[]; +} + +/** + * Transpile a parsed EVAL.yaml object into one or more evals.json objects. + * + * @param suite Parsed YAML object (already loaded, no file I/O here) + * @param source Source identifier for error messages (e.g. file path) + */ +export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): TranspileResult { + const warnings: string[] = []; + const files = new Map<string, EvalsJsonFile>(); + + if (typeof suite !== 'object' || suite === null) { + throw new Error(`Invalid EVAL.yaml: expected an object in '${source}'`); + } + + const rawSuite = suite as RawSuite; + + if (!Array.isArray(rawSuite.tests)) { + throw new Error(`Invalid EVAL.yaml: missing 'tests' array in '${source}'`); + } + + if (rawSuite.assert !== undefined && rawSuite.assertions === undefined) { + warnings.push("'assert' is deprecated at the suite level. 
Use 'assertions' instead."); + } + + const suiteAssertions = resolveSuiteAssertions(rawSuite); + + // Suite-level NL assertions (appended to every test) + const suiteNlAssertions: string[] = suiteAssertions + .filter((a) => a.type !== 'trigger-judge') + .flatMap(assertionToNaturalLanguageList); + + /** + * Helper: get or create the EvalsJsonFile for a skill. + */ + function getSkillFile(skillName: string): EvalsJsonFile { + const existing = files.get(skillName); + if (existing) return existing; + const created: EvalsJsonFile = { skill_name: skillName, evals: [] }; + files.set(skillName, created); + return created; + } + + const tests = rawSuite.tests as RawTestCase[]; + + for (let idx = 0; idx < tests.length; idx++) { + const rawCase = tests[idx]; + const caseAssertions = resolveAssertions(rawCase); + + if (rawCase.assert !== undefined && rawCase.assertions === undefined) { + const caseId = rawCase.id ?? idx + 1; + warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`); + } + + // Collect NL assertions (not trigger-judge) + const nlAssertions: string[] = []; + + // Prepend test-level criteria as NL assertion + if (typeof rawCase.criteria === 'string' && rawCase.criteria.trim()) { + nlAssertions.push(rawCase.criteria.trim()); + } + + for (const entry of caseAssertions) { + if (entry.type !== 'trigger-judge') { + nlAssertions.push(...assertionToNaturalLanguageList(entry)); + } + } + + // Append suite-level NL assertions + nlAssertions.push(...suiteNlAssertions); + + const triggerJudges = extractTriggerJudges(caseAssertions); + const { prompt, files: inputFiles } = extractInput(rawCase); + const expectedOutput = extractExpectedOutput(rawCase.expected_output); + + // Build the numeric id (1-based index) + const numericId = idx + 1; + + // Build the base case (without should_trigger — added per-skill below) + const baseCase: Omit<EvalsJsonCase, 'should_trigger'> & { should_trigger?: boolean } = { + id: numericId, + prompt, + ...(expectedOutput !== undefined && { 
expected_output: expectedOutput }), + ...(inputFiles.length > 0 && { files: inputFiles }), + assertions: nlAssertions, + }; + + if (triggerJudges.length === 0) { + // No trigger-judge: place in dominant skill (or _no-skill) + // Determine dominant skill by scanning all tests (first occurrence wins) + // We defer this: record with a sentinel and resolve after all tests are processed. + // For now, push to _no-skill; we'll re-assign at the end. + const noSkillFile = getSkillFile('_no-skill'); + noSkillFile.evals.push({ ...baseCase }); + } else { + // Place in each skill with the correct should_trigger value + for (const tj of triggerJudges) { + const skillName = typeof tj.skill === 'string' ? tj.skill : '_no-skill'; + const shouldTrigger = tj.should_trigger !== false; // default true + const skillFile = getSkillFile(skillName); + skillFile.evals.push({ ...baseCase, should_trigger: shouldTrigger }); + } + } + } + + // Re-assign _no-skill tests to the dominant skill (if one exists) + const noSkillFile = files.get('_no-skill'); + if (noSkillFile && noSkillFile.evals.length > 0) { + // Find the skill with the most tests (among real skills) + let dominantSkill: string | null = null; + let maxCount = 0; + for (const [name, f] of files) { + if (name !== '_no-skill' && f.evals.length > maxCount) { + maxCount = f.evals.length; + dominantSkill = name; + } + } + + if (dominantSkill) { + const targetFile = getSkillFile(dominantSkill); + for (const evalCase of noSkillFile.evals) { + targetFile.evals.push(evalCase); + } + files.delete('_no-skill'); + } + // else: keep _no-skill if there are no other skills + } + + return { files, warnings }; +} + +// --------------------------------------------------------------------------- +// File-level API +// --------------------------------------------------------------------------- + +/** + * Transpile an EVAL.yaml file into one or more evals.json objects. + * Returns a map from output filename → JSON content. 
 + * + * @param evalYamlPath Absolute path to the EVAL.yaml file + */ +export function transpileEvalYamlFile(evalYamlPath: string): TranspileResult { + const content = readFileSync(evalYamlPath, 'utf8'); + const parsed = parse(content) as unknown; + return transpileEvalYaml(parsed, path.basename(evalYamlPath)); +} + +/** + * Determine the output filename(s) for a transpile result. + * Single skill → "evals.json" + * Multiple skills → "<skill>.evals.json" + */ +export function getOutputFilenames(result: TranspileResult): Map<string, string> { + const names = new Map<string, string>(); + if (result.files.size === 1) { + for (const [skill] of result.files) { + names.set(skill, 'evals.json'); + } + } else { + for (const [skill] of result.files) { + const safeName = skill.replace(/[^a-zA-Z0-9_-]/g, '_'); + names.set(skill, `${safeName}.evals.json`); + } + } + return names; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 7df57f3f2..37f97bb18 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -5,6 +5,16 @@ export { isAgentSkillsFormat, parseAgentSkillsEvals, } from './evaluation/loaders/agent-skills-parser.js'; +export { + transpileEvalYaml, + transpileEvalYamlFile, + getOutputFilenames, +} from './evaluation/loaders/eval-yaml-transpiler.js'; +export type { + EvalsJsonCase, + EvalsJsonFile, + TranspileResult, +} from './evaluation/loaders/eval-yaml-transpiler.js'; export * from './evaluation/file-utils.js'; export * from './evaluation/providers/index.js'; export * from './evaluation/evaluators.js'; diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts new file mode 100644 index 000000000..e4d3177c2 --- /dev/null +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -0,0 +1,672 @@ +import { describe, expect, it } from 'bun:test'; + +import { + getOutputFilenames, + transpileEvalYaml, +} from 
'../../../src/evaluation/loaders/eval-yaml-transpiler.js'; + +// --------------------------------------------------------------------------- +// Fixtures +// --------------------------------------------------------------------------- + +const SINGLE_SKILL_SUITE = { + tests: [ + { + id: 'csv-top-months', + criteria: 'Agent finds the top 3 months by revenue', + input: [ + { + role: 'user', + content: [ + { type: 'file', value: 'evals/files/sales.csv' }, + { + type: 'text', + value: 'I have a CSV of monthly sales data. Find the top 3 months by revenue.', + }, + ], + }, + ], + expected_output: + 'The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).', + assertions: [ + { type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: true }, + { type: 'rubrics', criteria: 'Output identifies November as the highest revenue month' }, + { type: 'contains', value: '$22,500' }, + ], + }, + { + id: 'irrelevant-query', + input: 'What time is it?', + assertions: [{ type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: false }], + }, + ], +}; + +// --------------------------------------------------------------------------- +// Basic transpilation +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — basic', () => { + it('produces one evals.json for a single-skill suite', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.size).toBe(1); + expect(files.has('csv-analyzer')).toBe(true); + }); + + it('sets skill_name correctly', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.get('csv-analyzer')?.skill_name).toBe('csv-analyzer'); + }); + + it('produces two evals in output', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.get('csv-analyzer')?.evals).toHaveLength(2); + }); + + it('assigns 1-based numeric ids', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + 
const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].id).toBe(1); + expect(evals[1].id).toBe(2); + }); +}); + +// --------------------------------------------------------------------------- +// Input extraction +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — input extraction', () => { + it('extracts prompt from content block (type: text)', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].prompt).toBe( + 'I have a CSV of monthly sales data. Find the top 3 months by revenue.', + ); + }); + + it('extracts files from content block (type: file)', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].files).toEqual(['evals/files/sales.csv']); + }); + + it('handles string input shorthand', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[1].prompt).toBe('What time is it?'); + }); + + it('does not include files when none present', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[1].files).toBeUndefined(); + }); +}); + +// --------------------------------------------------------------------------- +// Trigger-judge handling +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — trigger-judge', () => { + it('sets should_trigger: true for trigger-judge with should_trigger true', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].should_trigger).toBe(true); + }); + + it('sets should_trigger: false for trigger-judge with should_trigger false', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = 
files.get('csv-analyzer')?.evals;
    expect(evals[1].should_trigger).toBe(false);
  });

  it('omits should_trigger when no trigger-judge in test', () => {
    const suite = {
      tests: [
        {
          id: 'no-trigger',
          input: 'Hello',
          assertions: [{ type: 'contains', value: 'Hi' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    // No skill: goes to _no-skill (or dominant skill if set)
    const allFiles = [...files.values()];
    expect(allFiles).toHaveLength(1);
    expect(allFiles[0].evals[0].should_trigger).toBeUndefined();
  });

  it('trigger-judge is NOT included in assertions array', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    // assertions should contain NL items, not 'trigger-judge' literal
    for (const a of evals[0].assertions) {
      expect(a).not.toContain('trigger-judge');
    }
  });
});

// ---------------------------------------------------------------------------
// NL assertion conversion
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — NL assertions', () => {
  it('prepends criteria to assertions', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions[0]).toBe('Agent finds the top 3 months by revenue');
  });

  it('converts rubrics type to criteria string', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions).toContain(
      'Output identifies November as the highest revenue month',
    );
  });

  it('converts contains to NL', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions).toContain("Output contains '$22,500'");
  });

  it('converts regex to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'regex', value: '\\d{4}-\\d{2}-\\d{2}' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output matches regex: \\d{4}-\\d{2}-\\d{2}');
  });

  it('converts equals to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'equals', value: 'exact answer' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output exactly equals: exact answer');
  });

  it('converts is-json to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'is-json' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output is valid JSON');
  });

  it('converts llm-judge prompt to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'llm-judge', prompt: 'The answer is clear and concise' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('The answer is clear and concise');
  });

  it('converts agent-judge with rubrics to multiple assertions', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'agent-judge',
              rubrics: [
                { id: 'r1', outcome: 'Correct result returned' },
                { id: 'r2', outcome: 'No unnecessary steps' },
              ],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Correct result returned');
    expect(evals[0].assertions).toContain('No unnecessary steps');
  });

  it('converts tool-trajectory to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'tool-trajectory',
              expected: [{ tool: 'read_file' }, { tool: 'write_file' }],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Agent called tools in order: read_file, write_file');
  });

  it('converts code-judge with name to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'code-judge',
              name: 'trigger-judge',
              description: 'Checks skill was triggered',
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('trigger-judge: Checks skill was triggered');
  });

  it('converts field-accuracy to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'field-accuracy',
              fields: [{ path: 'invoice.total' }, { path: 'invoice.date' }],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain(
      'Fields invoice.total, invoice.date match expected values',
    );
  });

  it('converts latency to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'latency', threshold: 5000 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Response time under 5000ms');
  });

  it('converts cost to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'cost', budget: 0.1 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Cost under $0.1');
  });

  it('converts token-usage to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'token-usage', max_total: 1000 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Token usage within limits');
  });

  it('converts execution-metrics to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'execution-metrics', max_tool_calls: 10 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Execution within metric bounds');
  });
});

// ---------------------------------------------------------------------------
// expected_output
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — expected_output', () => {
  it('includes expected_output as string', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].expected_output).toBe(
      'The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).',
    );
  });

  it('omits expected_output when absent', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[1].expected_output).toBeUndefined();
  });

  it('extracts string content from message array expected_output', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          expected_output: [{ role: 'assistant', content: 'World' }],
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.get('s')?.evals[0].expected_output).toBe('World');
  });
});

// ---------------------------------------------------------------------------
// input_files shorthand
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — input_files shorthand', () => {
  it('expands input_files alongside string input', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Analyze this file',
          input_files: ['data/file.csv', 'data/schema.json'],
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].prompt).toBe('Analyze this file');
    expect(evals[0].files).toEqual(['data/file.csv', 'data/schema.json']);
  });
});

// ---------------------------------------------------------------------------
// Root-level assertions distribution
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — suite-level assertions', () => {
  it('appends suite-level NL assertions to every test', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'first',
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'second',
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
      assertions: [{ type: 'contains', value: 'global-check' }],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain("Output contains 'global-check'");
    expect(evals[1].assertions).toContain("Output contains 'global-check'");
  });

  it('accepts deprecated assert: key at suite level', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'hello',
          assert: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
      assert: [{ type: 'contains', value: 'suite-level' }],
    };
    const { files, warnings } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain("Output contains 'suite-level'");
    expect(warnings.some((w) => w.includes("'assert' is deprecated"))).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Deprecated assert: key at test level
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — deprecated assert: key', () => {
  it('accepts assert: key at test level with deprecation warning', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assert: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'contains', value: 'world' },
          ],
        },
      ],
    };
    const { files, warnings } = transpileEvalYaml(suite);
    expect(files.has('skill-a')).toBe(true);
    expect(files.get('skill-a')?.evals[0].assertions).toContain("Output contains 'world'");
    expect(warnings.some((w) => w.includes("'assert' is deprecated"))).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Multi-skill
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — multi-skill', () => {
  it('produces one evals.json per skill', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'World',
          assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.size).toBe(2);
    expect(files.has('skill-a')).toBe(true);
    expect(files.has('skill-b')).toBe(true);
  });

  it('places test in both files when it has trigger-judges for two skills', () => {
    const suite = {
      tests: [
        {
          id: 'shared',
          input: 'Do something',
          assertions: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'trigger-judge', skill: 'skill-b', should_trigger: false },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.size).toBe(2);
    expect(files.get('skill-a')?.evals[0].should_trigger).toBe(true);
    expect(files.get('skill-b')?.evals[0].should_trigger).toBe(false);
  });

  it('assigns tests with no trigger-judge to dominant skill', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'contains', value: 'hi' },
          ],
        },
        {
          id: 't2',
          input: 'No trigger here',
          assertions: [{ type: 'contains', value: 'world' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    // _no-skill should be absorbed into skill-a (dominant)
    expect(files.has('_no-skill')).toBe(false);
    expect(files.get('skill-a')?.evals).toHaveLength(2);
  });

  it('keeps _no-skill file when there are no other skills', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'contains', value: 'hi' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.has('_no-skill')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Error handling
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — error handling', () => {
  it('throws when input is not an object', () => {
    expect(() => transpileEvalYaml('not an object')).toThrow('Invalid EVAL.yaml');
  });

  it('throws when tests array is missing', () => {
    expect(() => transpileEvalYaml({})).toThrow("missing 'tests' array");
  });

  it('includes source in error messages', () => {
    expect(() => transpileEvalYaml({}, 'my-file.yaml')).toThrow('my-file.yaml');
  });
});

// ---------------------------------------------------------------------------
// getOutputFilenames
// ---------------------------------------------------------------------------

describe('getOutputFilenames', () => {
  it('returns evals.json for single-skill result', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const names = getOutputFilenames({ files, warnings: [] });
    expect(names.get('csv-analyzer')).toBe('evals.json');
  });

  it('returns skill-prefixed filenames for multi-skill result', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'World',
          assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }],
        },
      ],
    };
    const result = transpileEvalYaml(suite);
    const names = getOutputFilenames(result);
    expect(names.get('skill-a')).toBe('skill-a.evals.json');
    expect(names.get('skill-b')).toBe('skill-b.evals.json');
  });
});