diff --git a/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts b/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts index fb2d4d866..cd05e295f 100644 --- a/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts +++ b/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts @@ -1,89 +1,13 @@ -#!/usr/bin/env bun /** - * trigger-judge: detects whether the agent invoked a named Claude Code skill. + * @deprecated This file is kept for reference only. * - * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py: - * - Only the FIRST tool call matters. Any non-Skill/Read tool as the first - * call means the skill was not triggered (mirrors run_eval.py's early-exit). - * - Skill tool: checks input.skill contains the skill name (case-sensitive). - * - Read tool: checks input.file_path contains the skill name (case-sensitive). - * - Supports negative cases via should_trigger: false. + * `trigger-judge` has been renamed to `skill-trigger` and promoted to a built-in + * evaluator in agentv core. You no longer need this file in .agentv/judges/. * - * Usage in eval YAML: - * assertions: - * - type: trigger-judge # discovered from .agentv/judges/ - * skill: my-skill-name # required: exact name as installed in .claude/commands/ - * should_trigger: true # optional: expected behaviour (default: true) + * Update your EVAL.yaml to use the built-in type: * - * Positive case (should_trigger: true): passes when skill fires. - * Negative case (should_trigger: false): passes when skill does NOT fire. + * assertions: + * - type: skill-trigger # built-in, no extra file needed + * skill: my-skill-name + * should_trigger: true */ -import { defineCodeJudge } from '@agentv/eval'; - -export default defineCodeJudge(({ output, config }) => { - const skillName = config?.skill as string | undefined; - const shouldTrigger = (config?.should_trigger ?? 
true) as boolean; - - if (!skillName) { - return { - score: 0, - misses: ['config.skill is required'], - reasoning: 'No skill name configured', - }; - } - - // Flatten all tool calls across messages and take only the first one. - // run_eval.py returns false as soon as a non-Skill/Read tool starts, so - // only the first tool call is relevant. - const firstTool = (output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0]; - - let triggered = false; - let evidence = ''; - - if (firstTool) { - const input = (firstTool.input ?? {}) as Record<string, unknown>; - - if (firstTool.tool === 'Skill') { - const skillArg = String(input.skill ?? ''); - if (skillArg.includes(skillName)) { - triggered = true; - evidence = `Skill tool invoked with skill="${skillArg}"`; - } - } else if (firstTool.tool === 'Read') { - const filePath = String(input.file_path ?? ''); - if (filePath.includes(skillName)) { - triggered = true; - evidence = `Read tool loaded skill file: ${filePath}`; - } - } - // Any other tool as first call: triggered remains false - } - - const pass = triggered === shouldTrigger; - - if (pass) { - return { - score: 1, - hits: [ - shouldTrigger - ? evidence || `Skill "${skillName}" triggered as expected` - : `Skill "${skillName}" correctly did not trigger`, - ], - reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger', - }; - } - - return { - score: 0, - misses: [ - shouldTrigger - ? firstTool - ? `First tool was "${firstTool.tool}" — not Skill/Read for "${skillName}"` - : `No tool calls recorded` - : evidence || `Skill "${skillName}" triggered unexpectedly`, - ], - reasoning: shouldTrigger - ? 
`Skill "${skillName}" was not triggered` - : `False trigger: skill fired when it should not have`, - }; -}); diff --git a/examples/features/transpile/csv-analyzer.EVAL.yaml b/examples/features/transpile/csv-analyzer.EVAL.yaml index 8473efcbc..683e1d670 100644 --- a/examples/features/transpile/csv-analyzer.EVAL.yaml +++ b/examples/features/transpile/csv-analyzer.EVAL.yaml @@ -10,7 +10,7 @@ tests: value: "I have a CSV of monthly sales data. Find the top 3 months by revenue." expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)." assertions: - - type: trigger-judge + - type: skill-trigger skill: csv-analyzer should_trigger: true - type: rubrics @@ -21,6 +21,6 @@ tests: - id: irrelevant-query input: "What time is it?" assertions: - - type: trigger-judge + - type: skill-trigger skill: csv-analyzer should_trigger: false diff --git a/packages/core/src/evaluation/evaluators/index.ts b/packages/core/src/evaluation/evaluators/index.ts index 2a7ea58aa..157ff7c99 100644 --- a/packages/core/src/evaluation/evaluators/index.ts +++ b/packages/core/src/evaluation/evaluators/index.ts @@ -55,6 +55,8 @@ export type { LlmJudgeEvaluatorOptions } from './llm-judge.js'; export { AgentJudgeEvaluator } from './agent-judge.js'; export type { AgentJudgeEvaluatorOptions } from './agent-judge.js'; +export { SkillTriggerEvaluator } from './skill-trigger.js'; + export { assembleLlmJudgePrompt } from './llm-judge-prompt.js'; export type { LlmJudgePromptAssembly } from './llm-judge-prompt.js'; diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts new file mode 100644 index 000000000..7688fb4e4 --- /dev/null +++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts @@ -0,0 +1,90 @@ +/** + * Built-in skill-trigger evaluator. + * + * Detects whether the agent invoked a named Claude Code skill as its first tool call. 
+ * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py: + * - Only the FIRST tool call matters. + * - Skill tool: checks input.skill contains the skill name (case-sensitive substring). + * - Read tool: checks input.file_path contains the skill name (case-sensitive substring). + * - Any other tool as first call means the skill was not triggered. + * - Supports negative cases via should_trigger: false. + */ + +import type { SkillTriggerEvaluatorConfig } from '../types.js'; +import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js'; + +export class SkillTriggerEvaluator implements Evaluator { + readonly kind = 'skill-trigger'; + + private readonly config: SkillTriggerEvaluatorConfig; + + constructor(config: SkillTriggerEvaluatorConfig) { + this.config = config; + } + + evaluate(context: EvaluationContext): EvaluationScore { + const skillName = this.config.skill; + const shouldTrigger = this.config.should_trigger !== false; // default true + + // Flatten all tool calls across messages and take only the first one. + // run_eval.py returns false as soon as a non-Skill/Read tool starts, + // so only the first tool call is relevant. + const firstTool = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0]; + + let triggered = false; + let evidence = ''; + + if (firstTool) { + const input = (firstTool.input ?? {}) as Record<string, unknown>; + + if (firstTool.tool === 'Skill') { + const skillArg = String(input.skill ?? ''); + if (skillArg.includes(skillName)) { + triggered = true; + evidence = `Skill tool invoked with skill="${skillArg}"`; + } + } else if (firstTool.tool === 'Read') { + const filePath = String(input.file_path ?? 
''); + if (filePath.includes(skillName)) { + triggered = true; + evidence = `Read tool loaded skill file: ${filePath}`; + } + } + // Any other tool as first call: triggered remains false + } + + const pass = triggered === shouldTrigger; + + if (pass) { + return { + score: 1, + verdict: 'pass', + hits: [ + shouldTrigger + ? evidence || `Skill "${skillName}" triggered as expected` + : `Skill "${skillName}" correctly did not trigger`, + ], + misses: [], + expectedAspectCount: 1, + reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger', + }; + } + + return { + score: 0, + verdict: 'fail', + hits: [], + misses: [ + shouldTrigger + ? firstTool + ? `First tool was "${firstTool.tool}" — not Skill/Read for "${skillName}"` + : 'No tool calls recorded' + : evidence || `Skill "${skillName}" triggered unexpectedly`, + ], + expectedAspectCount: 1, + reasoning: shouldTrigger + ? `Skill "${skillName}" was not triggered` + : 'False trigger: skill fired when it should not have', + }; + } +} diff --git a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts index 63e5904a7..4d9560157 100644 --- a/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts +++ b/packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts @@ -90,7 +90,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null { const type = entry.type; switch (type) { - case 'trigger-judge': + case 'skill-trigger': // Handled separately — not an NL assertion return null; @@ -232,11 +232,11 @@ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] { } /** - * Extract trigger-judge entries from an assertion list. - * Returns entries with type === 'trigger-judge'. + * Extract skill-trigger entries from an assertion list. + * Returns entries with type === 'skill-trigger'. 
*/ function extractTriggerJudges(assertions: RawAssertEntry[]): RawAssertEntry[] { - return assertions.filter((a) => a.type === 'trigger-judge'); + return assertions.filter((a) => a.type === 'skill-trigger'); } /** @@ -370,7 +370,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi // Suite-level NL assertions (appended to every test) const suiteNlAssertions: string[] = suiteAssertions - .filter((a) => a.type !== 'trigger-judge') + .filter((a) => a.type !== 'skill-trigger') .flatMap(assertionToNaturalLanguageList); /** @@ -395,7 +395,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`); } - // Collect NL assertions (not trigger-judge) + // Collect NL assertions (not skill-trigger) const nlAssertions: string[] = []; // Prepend test-level criteria as NL assertion @@ -404,7 +404,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi } for (const entry of caseAssertions) { - if (entry.type !== 'trigger-judge') { + if (entry.type !== 'skill-trigger') { nlAssertions.push(...assertionToNaturalLanguageList(entry)); } } @@ -429,7 +429,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi }; if (triggerJudges.length === 0) { - // No trigger-judge: place in dominant skill (or _no-skill) + // No skill-trigger: place in dominant skill (or _no-skill) // Determine dominant skill by scanning all tests (first occurrence wins) // We defer this: record with a sentinel and resolve after all tests are processed. // For now, push to _no-skill; we'll re-assign at the end. 
diff --git a/packages/core/src/evaluation/loaders/evaluator-parser.ts b/packages/core/src/evaluation/loaders/evaluator-parser.ts index 4e540f772..2eb72cb92 100644 --- a/packages/core/src/evaluation/loaders/evaluator-parser.ts +++ b/packages/core/src/evaluation/loaders/evaluator-parser.ts @@ -928,6 +928,28 @@ async function parseEvaluatorList( continue; } + if (typeValue === 'skill-trigger') { + const skillName = asString(rawEvaluator.skill); + if (!skillName) { + logWarning(`Skipping skill-trigger evaluator '${name}' in '${evalId}': missing skill`); + continue; + } + const rawShouldTrigger = rawEvaluator.should_trigger; + const shouldTrigger = typeof rawShouldTrigger === 'boolean' ? rawShouldTrigger : undefined; + const weight = validateWeight(rawEvaluator.weight, name, evalId); + const required = parseRequired(rawEvaluator.required); + evaluators.push({ + name, + type: 'skill-trigger', + skill: skillName, + ...(shouldTrigger !== undefined ? { should_trigger: shouldTrigger } : {}), + ...(weight !== undefined ? { weight } : {}), + ...(required !== undefined ? { required } : {}), + ...(negate !== undefined ? { negate } : {}), + }); + continue; + } + if (typeValue === 'contains') { const value = asString(rawEvaluator.value); if (!value) { @@ -1283,6 +1305,7 @@ async function parseEvaluatorList( /** Assertion evaluator types that support auto-generated names. */ const ASSERTION_TYPES = new Set([ + 'skill-trigger', 'contains', 'contains-any', 'contains-all', @@ -1310,6 +1333,10 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : undefined; switch (typeValue) { + case 'skill-trigger': { + const skillValue = asString(rawEvaluator.skill); + return skillValue ? `skill-trigger-${skillValue}` : 'skill-trigger'; + } case 'contains': return value ? 
`contains-${value}` : 'contains'; case 'contains-any': diff --git a/packages/core/src/evaluation/registry/builtin-evaluators.ts b/packages/core/src/evaluation/registry/builtin-evaluators.ts index 60d190f60..dee6b0237 100644 --- a/packages/core/src/evaluation/registry/builtin-evaluators.ts +++ b/packages/core/src/evaluation/registry/builtin-evaluators.ts @@ -17,6 +17,7 @@ import { FieldAccuracyEvaluator, LatencyEvaluator, LlmJudgeEvaluator, + SkillTriggerEvaluator, TokenUsageEvaluator, ToolTrajectoryEvaluator, runContainsAllAssertion, @@ -55,6 +56,7 @@ import type { LatencyEvaluatorConfig, LlmJudgeEvaluatorConfig, RegexEvaluatorConfig, + SkillTriggerEvaluatorConfig, StartsWithEvaluatorConfig, TokenUsageEvaluatorConfig, } from '../types.js'; @@ -235,6 +237,11 @@ export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => { }); }; +/** Factory for `skill-trigger` evaluator. */ +export const skillTriggerFactory: EvaluatorFactoryFn = (config) => { + return new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig); +}; + /** Factory for `contains` deterministic assertion. */ export const containsFactory: EvaluatorFactoryFn = (config) => { const c = config as ContainsEvaluatorConfig; @@ -434,6 +441,7 @@ export function createBuiltinRegistry(): EvaluatorRegistry { .register('token-usage', tokenUsageFactory) .register('execution-metrics', executionMetricsFactory) .register('agent-judge', agentJudgeFactory) + .register('skill-trigger', skillTriggerFactory) .register('contains', containsFactory) .register('contains-any', containsAnyFactory) .register('contains-all', containsAllFactory) diff --git a/packages/core/src/evaluation/registry/judge-discovery.ts b/packages/core/src/evaluation/registry/judge-discovery.ts index c4a843565..da65ccb8c 100644 --- a/packages/core/src/evaluation/registry/judge-discovery.ts +++ b/packages/core/src/evaluation/registry/judge-discovery.ts @@ -5,7 +5,7 @@ * them as code-judge evaluators in the registry. 
The file name (without * extension) becomes the evaluator type name. * - * Example: `.agentv/judges/trigger-judge.ts` → type "trigger-judge" in EVAL.yaml + * Example: `.agentv/judges/custom-judge.ts` → type "custom-judge" in EVAL.yaml */ import path from 'node:path'; diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index ed09d670b..b69c272ab 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -159,6 +159,7 @@ const EVALUATOR_KIND_VALUES = [ 'token-usage', 'execution-metrics', 'agent-judge', + 'skill-trigger', 'contains', 'contains-any', 'contains-all', @@ -726,6 +727,23 @@ export type RubricsEvaluatorConfig = { readonly negate?: boolean; }; +/** + * Configuration for the skill-trigger evaluator. + * Detects whether the agent invoked a named Claude Code skill as its first tool call. + * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py. + */ +export type SkillTriggerEvaluatorConfig = { + readonly name: string; + readonly type: 'skill-trigger'; + /** The skill name to check for (case-sensitive substring match) */ + readonly skill: string; + /** Whether the skill is expected to trigger (default: true) */ + readonly should_trigger?: boolean; + readonly weight?: number; + readonly required?: boolean | number; + readonly negate?: boolean; +}; + /** * Configuration for the inline-assert evaluator. * Wraps an AssertFn for in-process evaluation via the evaluate() API. 
@@ -749,6 +767,7 @@ export type EvaluatorConfig = | TokenUsageEvaluatorConfig | ExecutionMetricsEvaluatorConfig | AgentJudgeEvaluatorConfig + | SkillTriggerEvaluatorConfig | ContainsEvaluatorConfig | ContainsAnyEvaluatorConfig | ContainsAllEvaluatorConfig diff --git a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts index e4d3177c2..de224a1a2 100644 --- a/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts +++ b/packages/core/test/evaluation/loaders/eval-yaml-transpiler.test.ts @@ -29,7 +29,7 @@ const SINGLE_SKILL_SUITE = { expected_output: 'The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400).', assertions: [ - { type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: true }, + { type: 'skill-trigger', skill: 'csv-analyzer', should_trigger: true }, { type: 'rubrics', criteria: 'Output identifies November as the highest revenue month' }, { type: 'contains', value: '$22,500' }, ], @@ -37,7 +37,7 @@ const SINGLE_SKILL_SUITE = { { id: 'irrelevant-query', input: 'What time is it?', - assertions: [{ type: 'trigger-judge', skill: 'csv-analyzer', should_trigger: false }], + assertions: [{ type: 'skill-trigger', skill: 'csv-analyzer', should_trigger: false }], }, ], }; @@ -107,20 +107,20 @@ describe('transpileEvalYaml — input extraction', () => { // Trigger-judge handling // --------------------------------------------------------------------------- -describe('transpileEvalYaml — trigger-judge', () => { - it('sets should_trigger: true for trigger-judge with should_trigger true', () => { +describe('transpileEvalYaml — skill-trigger', () => { + it('sets should_trigger: true for skill-trigger with should_trigger true', () => { const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); const evals = files.get('csv-analyzer')?.evals; expect(evals[0].should_trigger).toBe(true); }); - it('sets should_trigger: false for trigger-judge 
with should_trigger false', () => { + it('sets should_trigger: false for skill-trigger with should_trigger false', () => { const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); const evals = files.get('csv-analyzer')?.evals; expect(evals[1].should_trigger).toBe(false); }); - it('omits should_trigger when no trigger-judge in test', () => { + it('omits should_trigger when no skill-trigger in test', () => { const suite = { tests: [ { @@ -137,12 +137,12 @@ describe('transpileEvalYaml — trigger-judge', () => { expect(allFiles[0].evals[0].should_trigger).toBeUndefined(); }); - it('trigger-judge is NOT included in assertions array', () => { + it('skill-trigger is NOT included in assertions array', () => { const { files } = transpileEvalYaml(SINGLE_SKILL_SUITE); const evals = files.get('csv-analyzer')?.evals; - // assertions should contain NL items, not 'trigger-judge' literal + // assertions should contain NL items, not 'skill-trigger' literal for (const a of evals[0].assertions) { - expect(a).not.toContain('trigger-judge'); + expect(a).not.toContain('skill-trigger'); } }); }); @@ -179,7 +179,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'regex', value: '\\d{4}-\\d{2}-\\d{2}' }, ], }, @@ -197,7 +197,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'equals', value: 'exact answer' }, ], }, @@ -215,7 +215,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'is-json' }, ], }, @@ -233,7 +233,7 @@ describe('transpileEvalYaml — NL 
assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'llm-judge', prompt: 'The answer is clear and concise' }, ], }, @@ -251,7 +251,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'agent-judge', rubrics: [ @@ -276,7 +276,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'tool-trajectory', expected: [{ tool: 'read_file' }, { tool: 'write_file' }], @@ -297,10 +297,10 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'code-judge', - name: 'trigger-judge', + name: 'skill-trigger', description: 'Checks skill was triggered', }, ], @@ -309,7 +309,7 @@ describe('transpileEvalYaml — NL assertions', () => { }; const { files } = transpileEvalYaml(suite); const evals = files.get('s')?.evals; - expect(evals[0].assertions).toContain('trigger-judge: Checks skill was triggered'); + expect(evals[0].assertions).toContain('skill-trigger: Checks skill was triggered'); }); it('converts field-accuracy to NL', () => { @@ -319,7 +319,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'field-accuracy', fields: [{ path: 'invoice.total' }, { path: 'invoice.date' }], @@ -342,7 +342,7 @@ describe('transpileEvalYaml — NL assertions', () => { 
id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'latency', threshold: 5000 }, ], }, @@ -360,7 +360,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'cost', budget: 0.1 }, ], }, @@ -378,7 +378,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'token-usage', max_total: 1000 }, ], }, @@ -396,7 +396,7 @@ describe('transpileEvalYaml — NL assertions', () => { id: 't1', input: 'test', assertions: [ - { type: 'trigger-judge', skill: 's', should_trigger: true }, + { type: 'skill-trigger', skill: 's', should_trigger: true }, { type: 'execution-metrics', max_tool_calls: 10 }, ], }, @@ -434,7 +434,7 @@ describe('transpileEvalYaml — expected_output', () => { id: 't1', input: 'Hello', expected_output: [{ role: 'assistant', content: 'World' }], - assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 's', should_trigger: true }], }, ], }; @@ -455,7 +455,7 @@ describe('transpileEvalYaml — input_files shorthand', () => { id: 't1', input: 'Analyze this file', input_files: ['data/file.csv', 'data/schema.json'], - assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 's', should_trigger: true }], }, ], }; @@ -477,12 +477,12 @@ describe('transpileEvalYaml — suite-level assertions', () => { { id: 't1', input: 'first', - assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 's', should_trigger: true }], }, { 
id: 't2', input: 'second', - assertions: [{ type: 'trigger-judge', skill: 's', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 's', should_trigger: true }], }, ], assertions: [{ type: 'contains', value: 'global-check' }], @@ -499,7 +499,7 @@ describe('transpileEvalYaml — suite-level assertions', () => { { id: 't1', input: 'hello', - assert: [{ type: 'trigger-judge', skill: 's', should_trigger: true }], + assert: [{ type: 'skill-trigger', skill: 's', should_trigger: true }], }, ], assert: [{ type: 'contains', value: 'suite-level' }], @@ -523,7 +523,7 @@ describe('transpileEvalYaml — deprecated assert: key', () => { id: 't1', input: 'Hello', assert: [ - { type: 'trigger-judge', skill: 'skill-a', should_trigger: true }, + { type: 'skill-trigger', skill: 'skill-a', should_trigger: true }, { type: 'contains', value: 'world' }, ], }, @@ -547,12 +547,12 @@ describe('transpileEvalYaml — multi-skill', () => { { id: 't1', input: 'Hello', - assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 'skill-a', should_trigger: true }], }, { id: 't2', input: 'World', - assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 'skill-b', should_trigger: true }], }, ], }; @@ -562,15 +562,15 @@ describe('transpileEvalYaml — multi-skill', () => { expect(files.has('skill-b')).toBe(true); }); - it('places test in both files when it has trigger-judges for two skills', () => { + it('places test in both files when it has skill-triggers for two skills', () => { const suite = { tests: [ { id: 'shared', input: 'Do something', assertions: [ - { type: 'trigger-judge', skill: 'skill-a', should_trigger: true }, - { type: 'trigger-judge', skill: 'skill-b', should_trigger: false }, + { type: 'skill-trigger', skill: 'skill-a', should_trigger: true }, + { type: 'skill-trigger', skill: 'skill-b', should_trigger: false }, ], }, ], @@ 
-581,14 +581,14 @@ describe('transpileEvalYaml — multi-skill', () => { expect(files.get('skill-b')?.evals[0].should_trigger).toBe(false); }); - it('assigns tests with no trigger-judge to dominant skill', () => { + it('assigns tests with no skill-trigger to dominant skill', () => { const suite = { tests: [ { id: 't1', input: 'Hello', assertions: [ - { type: 'trigger-judge', skill: 'skill-a', should_trigger: true }, + { type: 'skill-trigger', skill: 'skill-a', should_trigger: true }, { type: 'contains', value: 'hi' }, ], }, @@ -655,12 +655,12 @@ describe('getOutputFilenames', () => { { id: 't1', input: 'Hello', - assertions: [{ type: 'trigger-judge', skill: 'skill-a', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 'skill-a', should_trigger: true }], }, { id: 't2', input: 'World', - assertions: [{ type: 'trigger-judge', skill: 'skill-b', should_trigger: true }], + assertions: [{ type: 'skill-trigger', skill: 'skill-b', should_trigger: true }], }, ], }; diff --git a/packages/eval/src/assertion.ts b/packages/eval/src/assertion.ts index 2665b1493..dd28ea304 100644 --- a/packages/eval/src/assertion.ts +++ b/packages/eval/src/assertion.ts @@ -48,6 +48,7 @@ export type AssertionType = | 'token-usage' | 'execution-metrics' | 'agent-judge' + | 'skill-trigger' | 'contains' | 'contains-any' | 'contains-all'