From ff83a052cf6a807f00ea5ba5dd94d3dcb38ab253 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 15 Mar 2026 04:23:57 +0000 Subject: [PATCH] =?UTF-8?q?feat(cli):=20EVAL.yaml=20=E2=86=92=20evals.json?= =?UTF-8?q?=20transpiler=20(agentv=20transpile)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a transpiler that converts AgentV EVAL.yaml format to Agent Skills evals.json format for consumption by the skill-creator pipeline. - New core module: eval-yaml-transpiler (transpileEvalYaml, transpileEvalYamlFile, getOutputFilenames) - New CLI command: agentv transpile [--out-dir dir] [--stdout] - Handles both assertions: (current) and assert: (deprecated alias) - Extracts prompt and files from structured content blocks (type: text/file) - Converts all assertion types to natural language strings - trigger-judge maps to should_trigger bool, not assertions - Root-level assertions distributed to every test - Multi-skill input produces one evals.json per skill - Tests without trigger-judge assigned to dominant skill or _no-skill - 43 unit tests covering all field mappings and edge cases - Example EVAL.yaml and expected evals.json in examples/features/transpile/ --- apps/cli/src/commands/transpile/index.ts | 69 ++ apps/cli/src/index.ts | 2 + .../features/transpile/csv-analyzer.EVAL.yaml | 26 + .../transpile/csv-analyzer.evals.json | 23 + .../loaders/eval-yaml-transpiler.ts | 525 ++++++++++++++ packages/core/src/index.ts | 10 + .../loaders/eval-yaml-transpiler.test.ts | 672 ++++++++++++++++++ 7 files changed, 1327 insertions(+) create mode 100644 apps/cli/src/commands/transpile/index.ts create mode 100644 examples/features/transpile/csv-analyzer.EVAL.yaml create mode 100644 examples/features/transpile/csv-analyzer.evals.json create mode 100644 packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts create mode 100644 packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts diff --git 
a/apps/cli/src/commands/transpile/index.ts b/apps/cli/src/commands/transpile/index.ts new file mode 100644 index 000000000..82c6ec663 --- /dev/null +++ b/apps/cli/src/commands/transpile/index.ts @@ -0,0 +1,69 @@ +import { writeFileSync } from 'node:fs'; +import path from 'node:path'; +import { command, flag, option, optional, positional, string } from 'cmd-ts'; + +import { getOutputFilenames, transpileEvalYamlFile } from '@agentv/core'; + +export const transpileCommand = command({ + name: 'transpile', + description: 'Convert an EVAL.yaml file to Agent Skills evals.json format', + args: { + input: positional({ + type: string, + displayName: 'input', + description: 'Path to EVAL.yaml file', + }), + outDir: option({ + type: optional(string), + long: 'out-dir', + short: 'd', + description: 'Output directory (defaults to directory of input file)', + }), + stdout: flag({ + long: 'stdout', + description: 'Write to stdout instead of file(s) (only valid for single-skill output)', + }), + }, + handler: async ({ input, outDir, stdout }) => { + let result: ReturnType<typeof transpileEvalYamlFile>; + try { + result = transpileEvalYamlFile(path.resolve(input)); + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + + // Print warnings + for (const warning of result.warnings) { + console.warn(`Warning: ${warning}`); + } + + if (result.files.size === 0) { + console.error('Error: No output produced (no tests found)'); + process.exit(1); + } + + if (stdout) { + if (result.files.size > 1) { + console.error( + 'Error: --stdout is only valid when input produces a single evals.json (multi-skill input produces multiple files)', + ); + process.exit(1); + } + const [file] = result.files.values(); + process.stdout.write(JSON.stringify(file, null, 2)); + process.stdout.write('\n'); + return; + } + + const outputDir = outDir ? 
path.resolve(outDir) : path.dirname(path.resolve(input)); + const fileNames = getOutputFilenames(result); + + for (const [skill, evalsJson] of result.files) { + const fileName = fileNames.get(skill) ?? 'evals.json'; + const outputPath = path.join(outputDir, fileName); + writeFileSync(outputPath, `${JSON.stringify(evalsJson, null, 2)}\n`); + console.log(`Transpiled to ${outputPath}`); + } + }, +}); diff --git a/apps/cli/src/index.ts b/apps/cli/src/index.ts index dd9d7690b..c1ae9a2aa 100644 --- a/apps/cli/src/index.ts +++ b/apps/cli/src/index.ts @@ -10,6 +10,7 @@ import { generateCommand } from './commands/generate/index.js'; import { initCmdTsCommand } from './commands/init/index.js'; import { selfCommand } from './commands/self/index.js'; import { traceCommand } from './commands/trace/index.js'; +import { transpileCommand } from './commands/transpile/index.js'; import { trimCommand } from './commands/trim/index.js'; import { validateCommand } from './commands/validate/index.js'; import { workspaceCommand } from './commands/workspace/index.js'; @@ -29,6 +30,7 @@ export const app = subcommands({ init: initCmdTsCommand, self: selfCommand, trace: traceCommand, + transpile: transpileCommand, trim: trimCommand, validate: validateCommand, workspace: workspaceCommand, diff --git a/examples/features/transpile/csv-analyzer.EVAL.yaml b/examples/features/transpile/csv-analyzer.EVAL.yaml new file mode 100644 index 000000000..8473efcbc --- /dev/null +++ b/examples/features/transpile/csv-analyzer.EVAL.yaml @@ -0,0 +1,26 @@ +tests: + - id: csv-top-months + criteria: Agent finds the top 3 months by revenue + input: + - role: user + content: + - type: file + value: evals/files/sales.csv + - type: text + value: "I have a CSV of monthly sales data. Find the top 3 months by revenue." + expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)." 
+ assertions: + - type: trigger-judge + skill: csv-analyzer + should_trigger: true + - type: rubrics + criteria: "Output identifies November as the highest revenue month" + - type: contains + value: "$22,500" + + - id: irrelevant-query + input: "What time is it?" + assertions: + - type: trigger-judge + skill: csv-analyzer + should_trigger: false diff --git a/examples/features/transpile/csv-analyzer.evals.json b/examples/features/transpile/csv-analyzer.evals.json new file mode 100644 index 000000000..f843813f9 --- /dev/null +++ b/examples/features/transpile/csv-analyzer.evals.json @@ -0,0 +1,23 @@ +{ + "skill_name": "csv-analyzer", + "evals": [ + { + "id": 1, + "prompt": "I have a CSV of monthly sales data. Find the top 3 months by revenue.", + "expected_output": "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).", + "files": ["evals/files/sales.csv"], + "should_trigger": true, + "assertions": [ + "Agent finds the top 3 months by revenue", + "Output identifies November as the highest revenue month", + "Output contains '$22,500'" + ] + }, + { + "id": 2, + "prompt": "What time is it?", + "should_trigger": false, + "assertions": [] + } + ] +} diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts new file mode 100644 index 000000000..e170b0ed6 --- /dev/null +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -0,0 +1,525 @@ +/** + * EVAL.yaml → evals.json transpiler. + * + * Converts an AgentV EVAL.yaml file into Agent Skills evals.json format + * for consumption by the skill-creator pipeline. + * + * Handles both `assertions:` (current) and `assert:` (deprecated alias). 
+ */ + +import { readFileSync } from 'node:fs'; +import path from 'node:path'; +import { parse } from 'yaml'; + +// --------------------------------------------------------------------------- +// evals.json output types +// --------------------------------------------------------------------------- + +export interface EvalsJsonCase { + id: number; + prompt: string; + expected_output?: string; + files?: string[]; + should_trigger?: boolean; + assertions: string[]; +} + +export interface EvalsJsonFile { + skill_name: string; + evals: EvalsJsonCase[]; +} + +// --------------------------------------------------------------------------- +// Raw YAML input types (unvalidated) +// --------------------------------------------------------------------------- + +type RawContent = + | string + | Array<{ type?: string; value?: string; [key: string]: unknown }> + | unknown; + +interface RawMessage { + role?: string; + content?: RawContent; + [key: string]: unknown; +} + +interface RawAssertEntry { + type?: string; + skill?: string; + should_trigger?: boolean; + criteria?: string; + value?: string; + name?: string; + description?: string; + command?: unknown; + prompt?: string; + rubrics?: unknown[]; + expected?: unknown[]; + fields?: unknown[]; + threshold?: number; + budget?: number; + [key: string]: unknown; +} + +interface RawTestCase { + id?: string | number; + criteria?: string; + input?: string | RawMessage[] | { [key: string]: unknown }; + input_files?: string[]; + expected_output?: string | RawMessage[] | unknown; + assertions?: RawAssertEntry[]; + /** @deprecated Use `assertions` instead */ + assert?: RawAssertEntry[]; + [key: string]: unknown; +} + +interface RawSuite { + tests?: RawTestCase[]; + assertions?: RawAssertEntry[]; + /** @deprecated Use `assertions` instead */ + assert?: RawAssertEntry[]; + [key: string]: unknown; +} + +// --------------------------------------------------------------------------- +// Assertion → natural language conversion +// 
--------------------------------------------------------------------------- + +function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { + const type = entry.type; + + switch (type) { + case 'trigger-judge': + // Handled separately — not an NL assertion + return null; + + case 'rubrics': { + // criteria may be a string (NL) or array of rubric items + if (typeof entry.criteria === 'string') { + return entry.criteria; + } + return null; + } + + case 'contains': + return `Output contains '${entry.value}'`; + + case 'contains-any': + case 'contains_any': { + const values = Array.isArray(entry.value) + ? (entry.value as string[]).join("', '") + : entry.value; + return `Output contains any of: '${values}'`; + } + + case 'contains-all': + case 'contains_all': { + const values = Array.isArray(entry.value) + ? (entry.value as string[]).join("', '") + : entry.value; + return `Output contains all of: '${values}'`; + } + + case 'icontains': + return `Output contains (case-insensitive) '${entry.value}'`; + + case 'regex': + return `Output matches regex: ${entry.value}`; + + case 'equals': + return `Output exactly equals: ${entry.value}`; + + case 'is-json': + case 'is_json': + return 'Output is valid JSON'; + + case 'starts-with': + case 'starts_with': + return `Output starts with '${entry.value}'`; + + case 'ends-with': + case 'ends_with': + return `Output ends with '${entry.value}'`; + + case 'llm-judge': + case 'llm_judge': + return typeof entry.prompt === 'string' ? entry.prompt : null; + + case 'agent-judge': + case 'agent_judge': { + // Expand each rubric item to its own assertion string + // Return the first one — callers handle arrays via assertionToNaturalLanguageList + if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { + return null; // handled by list expansion below + } + return typeof entry.prompt === 'string' ? 
entry.prompt : null; + } + + case 'tool-trajectory': + case 'tool_trajectory': { + const expectedArr = Array.isArray(entry.expected) ? entry.expected : []; + const tools = (expectedArr as Array<{ tool?: string }>) + .map((e) => e.tool) + .filter(Boolean) + .join(', '); + return tools + ? `Agent called tools in order: ${tools}` + : 'Agent followed expected tool trajectory'; + } + + case 'code-judge': + case 'code_judge': { + const namePart = + entry.name ?? (Array.isArray(entry.command) ? entry.command.join(' ') : entry.command); + const descPart = typeof entry.description === 'string' ? `: ${entry.description}` : ''; + return namePart ? `${namePart}${descPart}` : 'Code judge assertion'; + } + + case 'field-accuracy': + case 'field_accuracy': { + const fieldPaths = Array.isArray(entry.fields) + ? (entry.fields as Array<{ path?: string }>) + .map((f) => f.path) + .filter(Boolean) + .join(', ') + : ''; + return fieldPaths + ? `Fields ${fieldPaths} match expected values` + : 'Fields match expected values'; + } + + case 'latency': + return typeof entry.threshold === 'number' + ? `Response time under ${entry.threshold}ms` + : 'Response time within threshold'; + + case 'cost': + return typeof entry.budget === 'number' + ? `Cost under $${entry.budget}` + : 'Cost within budget'; + + case 'token-usage': + case 'token_usage': + return 'Token usage within limits'; + + case 'execution-metrics': + case 'execution_metrics': + return 'Execution within metric bounds'; + + default: + // Unknown type: try to produce something readable + if (typeof entry.criteria === 'string') return entry.criteria; + if (typeof entry.prompt === 'string') return entry.prompt; + return type ? `${type} assertion` : null; + } +} + +/** + * Expand a single assertion entry into zero or more NL strings. + * Most assertions produce exactly one string; agent-judge with rubrics expands to many. 
+ */ +function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { + if (entry.type === 'agent-judge' || entry.type === 'agent_judge') { + if (Array.isArray(entry.rubrics) && entry.rubrics.length > 0) { + return (entry.rubrics as Array<{ outcome?: string; criteria?: string; id?: string }>) + .map((r) => r.outcome ?? r.criteria ?? r.id) + .filter((s): s is string => typeof s === 'string'); + } + } + const nl = assertionToNaturalLanguage(entry); + return nl !== null ? [nl] : []; +} + +/** + * Extract trigger-judge entries from an assertion list. + * Returns entries with type === 'trigger-judge'. + */ +function extractTriggerJudges(assertions: RawAssertEntry[]): RawAssertEntry[] { + return assertions.filter((a) => a.type === 'trigger-judge'); +} + +/** + * Collect all assertion entries for a test case, accepting both + * `assertions` and deprecated `assert` key. + */ +function resolveAssertions(rawCase: RawTestCase): RawAssertEntry[] { + if (Array.isArray(rawCase.assertions)) return rawCase.assertions; + if (Array.isArray(rawCase.assert)) return rawCase.assert; + return []; +} + +/** + * Collect suite-level assertions (applied to every test). + */ +function resolveSuiteAssertions(suite: RawSuite): RawAssertEntry[] { + if (Array.isArray(suite.assertions)) return suite.assertions; + if (Array.isArray(suite.assert)) return suite.assert; + return []; +} + +// --------------------------------------------------------------------------- +// Input extraction +// --------------------------------------------------------------------------- + +interface ExtractedInput { + prompt: string; + files: string[]; +} + +/** + * Extract prompt text and file paths from a test case input. 
+ * + * Supports: + * - String input → prompt, no files + * - Message array with role: user and content blocks + * - input_files shorthand (alongside string or message-array input) + */ +function extractInput(rawCase: RawTestCase): ExtractedInput { + const inputFiles: string[] = []; + + // Collect input_files shorthand + if (Array.isArray(rawCase.input_files)) { + inputFiles.push(...(rawCase.input_files as string[]).filter((f) => typeof f === 'string')); + } + + const input = rawCase.input; + + // String shorthand + if (typeof input === 'string') { + return { prompt: input, files: inputFiles }; + } + + // Message array + if (Array.isArray(input)) { + let promptText = ''; + const filePaths: string[] = [...inputFiles]; + + for (const msg of input as RawMessage[]) { + if (msg.role !== 'user') continue; + + // String content + if (typeof msg.content === 'string') { + promptText = msg.content; + continue; + } + + // Content block array + if (Array.isArray(msg.content)) { + for (const block of msg.content as Array<{ type?: string; value?: string }>) { + if (block.type === 'text' && typeof block.value === 'string') { + promptText = block.value; + } else if (block.type === 'file' && typeof block.value === 'string') { + filePaths.push(block.value); + } + } + } + } + + return { prompt: promptText, files: filePaths }; + } + + return { prompt: '', files: inputFiles }; +} + +/** + * Flatten expected_output to a string. + * Accepts string, message array (takes last assistant message content), + * or any other value serialized to JSON. 
 + */ +function extractExpectedOutput(raw: unknown): string | undefined { + if (raw === undefined || raw === null) return undefined; + if (typeof raw === 'string') return raw; + + if (Array.isArray(raw)) { + // Take the last assistant message content + for (let i = raw.length - 1; i >= 0; i--) { + const msg = raw[i] as RawMessage; + if (typeof msg.content === 'string') return msg.content; + } + return undefined; + } + + return JSON.stringify(raw); +} + +// --------------------------------------------------------------------------- +// Transpiler core +// --------------------------------------------------------------------------- + +/** + * Result of transpiling a single EVAL.yaml. + * May produce multiple evals.json files (one per skill). + */ +export interface TranspileResult { + /** Map from skill_name → EvalsJsonFile */ + files: Map<string, EvalsJsonFile>; + /** Warning messages accumulated during transpilation */ + warnings: string[]; +} + +/** + * Transpile a parsed EVAL.yaml object into one or more evals.json objects. + * + * @param suite Parsed YAML object (already loaded, no file I/O here) + * @param source Source identifier for error messages (e.g. file path) + */ +export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): TranspileResult { + const warnings: string[] = []; + const files = new Map<string, EvalsJsonFile>(); + + if (typeof suite !== 'object' || suite === null) { + throw new Error(`Invalid EVAL.yaml: expected an object in '${source}'`); + } + + const rawSuite = suite as RawSuite; + + if (!Array.isArray(rawSuite.tests)) { + throw new Error(`Invalid EVAL.yaml: missing 'tests' array in '${source}'`); + } + + if (rawSuite.assert !== undefined && rawSuite.assertions === undefined) { + warnings.push("'assert' is deprecated at the suite level. 
Use 'assertions' instead."); + } + + const suiteAssertions = resolveSuiteAssertions(rawSuite); + + // Suite-level NL assertions (appended to every test) + const suiteNlAssertions: string[] = suiteAssertions + .filter((a) => a.type !== 'trigger-judge') + .flatMap(assertionToNaturalLanguageList); + + /** + * Helper: get or create the EvalsJsonFile for a skill. + */ + function getSkillFile(skillName: string): EvalsJsonFile { + const existing = files.get(skillName); + if (existing) return existing; + const created: EvalsJsonFile = { skill_name: skillName, evals: [] }; + files.set(skillName, created); + return created; + } + + const tests = rawSuite.tests as RawTestCase[]; + + for (let idx = 0; idx < tests.length; idx++) { + const rawCase = tests[idx]; + const caseAssertions = resolveAssertions(rawCase); + + if (rawCase.assert !== undefined && rawCase.assertions === undefined) { + const caseId = rawCase.id ?? idx + 1; + warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`); + } + + // Collect NL assertions (not trigger-judge) + const nlAssertions: string[] = []; + + // Prepend test-level criteria as NL assertion + if (typeof rawCase.criteria === 'string' && rawCase.criteria.trim()) { + nlAssertions.push(rawCase.criteria.trim()); + } + + for (const entry of caseAssertions) { + if (entry.type !== 'trigger-judge') { + nlAssertions.push(...assertionToNaturalLanguageList(entry)); + } + } + + // Append suite-level NL assertions + nlAssertions.push(...suiteNlAssertions); + + const triggerJudges = extractTriggerJudges(caseAssertions); + const { prompt, files: inputFiles } = extractInput(rawCase); + const expectedOutput = extractExpectedOutput(rawCase.expected_output); + + // Build the numeric id (1-based index) + const numericId = idx + 1; + + // Build the base case (without should_trigger — added per-skill below) + const baseCase: Omit<EvalsJsonCase, 'should_trigger'> & { should_trigger?: boolean } = { + id: numericId, + prompt, + ...(expectedOutput !== undefined && { 
expected_output: expectedOutput }), + ...(inputFiles.length > 0 && { files: inputFiles }), + assertions: nlAssertions, + }; + + if (triggerJudges.length === 0) { + // No trigger-judge: place in dominant skill (or _no-skill) + // Determine dominant skill by scanning all tests (first occurrence wins) + // We defer this: record with a sentinel and resolve after all tests are processed. + // For now, push to _no-skill; we'll re-assign at the end. + const noSkillFile = getSkillFile('_no-skill'); + noSkillFile.evals.push({ ...baseCase }); + } else { + // Place in each skill with the correct should_trigger value + for (const tj of triggerJudges) { + const skillName = typeof tj.skill === 'string' ? tj.skill : '_no-skill'; + const shouldTrigger = tj.should_trigger !== false; // default true + const skillFile = getSkillFile(skillName); + skillFile.evals.push({ ...baseCase, should_trigger: shouldTrigger }); + } + } + } + + // Re-assign _no-skill tests to the dominant skill (if one exists) + const noSkillFile = files.get('_no-skill'); + if (noSkillFile && noSkillFile.evals.length > 0) { + // Find the skill with the most tests (among real skills) + let dominantSkill: string | null = null; + let maxCount = 0; + for (const [name, f] of files) { + if (name !== '_no-skill' && f.evals.length > maxCount) { + maxCount = f.evals.length; + dominantSkill = name; + } + } + + if (dominantSkill) { + const targetFile = getSkillFile(dominantSkill); + for (const evalCase of noSkillFile.evals) { + targetFile.evals.push(evalCase); + } + files.delete('_no-skill'); + } + // else: keep _no-skill if there are no other skills + } + + return { files, warnings }; +} + +// --------------------------------------------------------------------------- +// File-level API +// --------------------------------------------------------------------------- + +/** + * Transpile an EVAL.yaml file into one or more evals.json objects. + * Returns a map from output filename → JSON content. 
 + * + * @param evalYamlPath Absolute path to the EVAL.yaml file + */ +export function transpileEvalYamlFile(evalYamlPath: string): TranspileResult { + const content = readFileSync(evalYamlPath, 'utf8'); + const parsed = parse(content) as unknown; + return transpileEvalYaml(parsed, path.basename(evalYamlPath)); +} + +/** + * Determine the output filename(s) for a transpile result. + * Single skill → "evals.json" + * Multiple skills → "<skill>.evals.json" + */ +export function getOutputFilenames(result: TranspileResult): Map<string, string> { + const names = new Map<string, string>(); + if (result.files.size === 1) { + for (const [skill] of result.files) { + names.set(skill, 'evals.json'); + } + } else { + for (const [skill] of result.files) { + const safeName = skill.replace(/[^a-zA-Z0-9_-]/g, '_'); + names.set(skill, `${safeName}.evals.json`); + } + } + return names; +} diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 7df57f3f2..37f97bb18 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -5,6 +5,16 @@ export { isAgentSkillsFormat, parseAgentSkillsEvals, } from './evaluation/loaders/agent-skills-parser.js'; +export { + transpileEvalYaml, + transpileEvalYamlFile, + getOutputFilenames, +} from './evaluation/loaders/eval-yaml-transpiler.js'; +export type { + EvalsJsonCase, + EvalsJsonFile, + TranspileResult, +} from './evaluation/loaders/eval-yaml-transpiler.js'; export * from './evaluation/file-utils.js'; export * from './evaluation/providers/index.js'; export * from './evaluation/evaluators.js'; diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts new file mode 100644 index 000000000..e4d3177c2 --- /dev/null +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -0,0 +1,672 @@ +import { describe, expect, it } from 'bun:test'; + +import { + getOutputFilenames, + transpileEvalYaml, +} from 
'../../../src/evaluation/loaders/eval-yaml-transpiler.js'; + +// --------------------------------------------------------------------------- +// Fixtures +// --------------------------------------------------------------------------- + +const SINGLE_SKILL_SUITE = { + tests: [ + { + id: 'csv-top-months', + criteria: 'Agent finds the top 3 months by revenue', + input: [ + { + role: 'user', + content: [ + { type: 'file', value: 'evals/files/sales.csv' }, + { + type: 'text', + value: 'I have a CSV of monthly sales data. Find the top 3 months by revenue.', + }, + ], + }, + ], + expected_output: + 'The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).', + assertions: [ + { type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: true }, + { type: 'rubrics', criteria: 'Output identifies November as the highest revenue month' }, + { type: 'contains', value: '$22,500' }, + ], + }, + { + id: 'irrelevant-query', + input: 'What time is it?', + assertions: [{ type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: false }], + }, + ], +}; + +// --------------------------------------------------------------------------- +// Basic transpilation +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — basic', () => { + it('produces one evals.json for a single-skill suite', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.size).toBe(1); + expect(files.has('csv-analyzer')).toBe(true); + }); + + it('sets skill_name correctly', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.get('csv-analyzer')?.skill_name).toBe('csv-analyzer'); + }); + + it('produces two evals in output', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + expect(files.get('csv-analyzer')?.evals).toHaveLength(2); + }); + + it('assigns 1-based numeric ids', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + 
const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].id).toBe(1); + expect(evals[1].id).toBe(2); + }); +}); + +// --------------------------------------------------------------------------- +// Input extraction +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — input extraction', () => { + it('extracts prompt from content block (type: text)', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].prompt).toBe( + 'I have a CSV of monthly sales data. Find the top 3 months by revenue.', + ); + }); + + it('extracts files from content block (type: file)', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].files).toEqual(['evals/files/sales.csv']); + }); + + it('handles string input shorthand', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[1].prompt).toBe('What time is it?'); + }); + + it('does not include files when none present', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[1].files).toBeUndefined(); + }); +}); + +// --------------------------------------------------------------------------- +// Trigger-judge handling +// --------------------------------------------------------------------------- + +describe('transpileEvalYaml — trigger-judge', () => { + it('sets should_trigger: true for trigger-judge with should_trigger true', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = files.get('csv-analyzer')?.evals; + expect(evals[0].should_trigger).toBe(true); + }); + + it('sets should_trigger: false for trigger-judge with should_trigger false', () => { + const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); + const evals = 
files.get('csv-analyzer')?.evals;
    expect(evals[1].should_trigger).toBe(false);
  });

  it('omits should_trigger when no trigger-judge in test', () => {
    const suite = {
      tests: [
        {
          id: 'no-trigger',
          input: 'Hello',
          assertions: [{ type: 'contains', value: 'Hi' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    // No skill: goes to _no-skill (or dominant skill if set)
    const allFiles = [...files.values()];
    expect(allFiles).toHaveLength(1);
    expect(allFiles[0].evals[0].should_trigger).toBeUndefined();
  });

  it('trigger-judge is NOT included in assertions array', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    // assertions should contain NL items, not 'trigger-judge' literal
    for (const a of evals[0].assertions) {
      expect(a).not.toContain('trigger-judge');
    }
  });
});

// ---------------------------------------------------------------------------
// NL assertion conversion
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — NL assertions', () => {
  it('prepends criteria to assertions', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions[0]).toBe('Agent finds the top 3 months by revenue');
  });

  it('converts rubrics type to criteria string', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions).toContain(
      'Output identifies November as the highest revenue month',
    );
  });

  it('converts contains to NL', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].assertions).toContain("Output contains '$22,500'");
  });

  it('converts regex to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'regex', value: '\\d{4}-\\d{2}-\\d{2}' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output matches regex: \\d{4}-\\d{2}-\\d{2}');
  });

  it('converts equals to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'equals', value: 'exact answer' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output exactly equals: exact answer');
  });

  it('converts is-json to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'is-json' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Output is valid JSON');
  });

  it('converts llm-judge prompt to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'llm-judge', prompt: 'The answer is clear and concise' },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('The answer is clear and concise');
  });

  it('converts agent-judge with rubrics to multiple assertions', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'agent-judge',
              rubrics: [
                { id: 'r1', outcome: 'Correct result returned' },
                { id: 'r2', outcome: 'No unnecessary steps' },
              ],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Correct result returned');
    expect(evals[0].assertions).toContain('No unnecessary steps');
  });

  it('converts tool-trajectory to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'tool-trajectory',
              expected: [{ tool: 'read_file' }, { tool: 'write_file' }],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Agent called tools in order: read_file, write_file');
  });

  it('converts code-judge with name to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'code-judge',
              name: 'trigger-judge',
              description: 'Checks skill was triggered',
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('trigger-judge: Checks skill was triggered');
  });

  it('converts field-accuracy to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            {
              type: 'field-accuracy',
              fields: [{ path: 'invoice.total' }, { path: 'invoice.date' }],
            },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain(
      'Fields invoice.total, invoice.date match expected values',
    );
  });

  it('converts latency to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'latency', threshold: 5000 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Response time under 5000ms');
  });

  it('converts cost to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'cost', budget: 0.1 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Cost under $0.1');
  });

  it('converts token-usage to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'token-usage', max_total: 1000 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Token usage within limits');
  });

  it('converts execution-metrics to NL', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'test',
          assertions: [
            { type: 'trigger-judge', skill: 's', should_trigger: true },
            { type: 'execution-metrics', max_tool_calls: 10 },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain('Execution within metric bounds');
  });
});

// ---------------------------------------------------------------------------
// expected_output
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — expected_output', () => {
  it('includes expected_output as string', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[0].expected_output).toBe(
      'The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).',
    );
  });

  it('omits expected_output when absent', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const evals = files.get('csv-analyzer')?.evals;
    expect(evals[1].expected_output).toBeUndefined();
  });

  it('extracts string content from message array expected_output', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          expected_output: [{ role: 'assistant', content: 'World' }],
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.get('s')?.evals[0].expected_output).toBe('World');
  });
});

// ---------------------------------------------------------------------------
// input_files shorthand
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — input_files shorthand', () => {
  it('expands input_files alongside string input', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Analyze this file',
          input_files: ['data/file.csv', 'data/schema.json'],
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].prompt).toBe('Analyze this file');
    expect(evals[0].files).toEqual(['data/file.csv', 'data/schema.json']);
  });
});

// ---------------------------------------------------------------------------
// Root-level assertions distribution
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — suite-level assertions', () => {
  it('appends suite-level NL assertions to every test', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'first',
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'second',
          assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
      assertions: [{ type: 'contains', value: 'global-check' }],
    };
    const { files } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain("Output contains 'global-check'");
    expect(evals[1].assertions).toContain("Output contains 'global-check'");
  });

  it('accepts deprecated assert: key at suite level', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'hello',
          assert: [{ type: 'trigger-judge', skill: 's', should_trigger: true }],
        },
      ],
      assert: [{ type: 'contains', value: 'suite-level' }],
    };
    const { files, warnings } = transpileEvalYaml(suite);
    const evals = files.get('s')?.evals;
    expect(evals[0].assertions).toContain("Output contains 'suite-level'");
    expect(warnings.some((w) => w.includes("'assert' is deprecated"))).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Deprecated assert: key at test level
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — deprecated assert: key', () => {
  it('accepts assert: key at test level with deprecation warning', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assert: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'contains', value: 'world' },
          ],
        },
      ],
    };
    const { files, warnings } = transpileEvalYaml(suite);
    expect(files.has('skill-a')).toBe(true);
    expect(files.get('skill-a')?.evals[0].assertions).toContain("Output contains 'world'");
    expect(warnings.some((w) => w.includes("'assert' is deprecated"))).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Multi-skill
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — multi-skill', () => {
  it('produces one evals.json per skill', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'World',
          assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.size).toBe(2);
    expect(files.has('skill-a')).toBe(true);
    expect(files.has('skill-b')).toBe(true);
  });

  it('places test in both files when it has trigger-judges for two skills', () => {
    const suite = {
      tests: [
        {
          id: 'shared',
          input: 'Do something',
          assertions: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'trigger-judge', skill: 'skill-b', should_trigger: false },
          ],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.size).toBe(2);
    expect(files.get('skill-a')?.evals[0].should_trigger).toBe(true);
    expect(files.get('skill-b')?.evals[0].should_trigger).toBe(false);
  });

  it('assigns tests with no trigger-judge to dominant skill', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [
            { type: 'trigger-judge', skill: 'skill-a', should_trigger: true },
            { type: 'contains', value: 'hi' },
          ],
        },
        {
          id: 't2',
          input: 'No trigger here',
          assertions: [{ type: 'contains', value: 'world' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    // _no-skill should be absorbed into skill-a (dominant)
    expect(files.has('_no-skill')).toBe(false);
    expect(files.get('skill-a')?.evals).toHaveLength(2);
  });

  it('keeps _no-skill file when there are no other skills', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'contains', value: 'hi' }],
        },
      ],
    };
    const { files } = transpileEvalYaml(suite);
    expect(files.has('_no-skill')).toBe(true);
  });
});

// ---------------------------------------------------------------------------
// Error handling
// ---------------------------------------------------------------------------

describe('transpileEvalYaml — error handling', () => {
  it('throws when input is not an object', () => {
    expect(() => transpileEvalYaml('not an object')).toThrow('Invalid EVAL.yaml');
  });

  it('throws when tests array is missing', () => {
    expect(() => transpileEvalYaml({})).toThrow("missing 'tests' array");
  });

  it('includes source in error messages', () => {
    expect(() => transpileEvalYaml({}, 'my-file.yaml')).toThrow('my-file.yaml');
  });
});

// ---------------------------------------------------------------------------
// getOutputFilenames
// ---------------------------------------------------------------------------

describe('getOutputFilenames', () => {
  it('returns evals.json for single-skill result', () => {
    const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE);
    const names = getOutputFilenames({ files, warnings: [] });
    expect(names.get('csv-analyzer')).toBe('evals.json');
  });

  it('returns skill-prefixed filenames for multi-skill result', () => {
    const suite = {
      tests: [
        {
          id: 't1',
          input: 'Hello',
          assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }],
        },
        {
          id: 't2',
          input: 'World',
          assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }],
        },
      ],
    };
    const result = transpileEvalYaml(suite);
    const names = getOutputFilenames(result);
    expect(names.get('skill-a')).toBe('skill-a.evals.json');
    expect(names.get('skill-b')).toBe('skill-b.evals.json');
  });
});