Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,89 +1,13 @@
#!/usr/bin/env bun
/**
 * @deprecated This file is kept for reference only.
 *
 * `trigger-judge` has been renamed to `skill-trigger` and promoted to a built-in
 * evaluator in agentv core. You no longer need this file in .agentv/judges/.
 *
 * Update your EVAL.yaml to use the built-in type:
 *
 *   assertions:
 *     - type: skill-trigger        # built-in, no extra file needed
 *       skill: my-skill-name
 *       should_trigger: true
 */
import { defineCodeJudge } from '@agentv/eval';

/**
 * Code judge: reports whether the agent invoked the named Claude Code skill.
 *
 * Only the first tool call across the whole transcript is examined, mirroring
 * run_eval.py's early exit on any non-Skill/Read first tool. Supports negative
 * expectations via `should_trigger: false`.
 */
export default defineCodeJudge(({ output, config }) => {
  const skillName = config?.skill as string | undefined;
  // Unless the YAML explicitly opts out, a trigger is expected.
  const shouldTrigger = (config?.should_trigger ?? true) as boolean;

  if (!skillName) {
    return {
      score: 0,
      misses: ['config.skill is required'],
      reasoning: 'No skill name configured',
    };
  }

  // Gather every tool call in transcript order; only the first one matters.
  const allCalls = (output ?? []).flatMap((message) => message.toolCalls ?? []);
  const first = allCalls.length > 0 ? allCalls[0] : undefined;

  let fired = false;
  let detail = '';

  if (first) {
    const args = (first.input ?? {}) as Record<string, unknown>;
    switch (first.tool) {
      case 'Skill': {
        // Skill tool: the skill argument must contain the name (case-sensitive).
        const invoked = String(args.skill ?? '');
        if (invoked.includes(skillName)) {
          fired = true;
          detail = `Skill tool invoked with skill="${invoked}"`;
        }
        break;
      }
      case 'Read': {
        // Read tool: loading the skill file also counts as a trigger.
        const path = String(args.file_path ?? '');
        if (path.includes(skillName)) {
          fired = true;
          detail = `Read tool loaded skill file: ${path}`;
        }
        break;
      }
      default:
        // Any other tool as the first call: not a trigger.
        break;
    }
  }

  if (fired === shouldTrigger) {
    return {
      score: 1,
      hits: [
        shouldTrigger
          ? detail || `Skill "${skillName}" triggered as expected`
          : `Skill "${skillName}" correctly did not trigger`,
      ],
      reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger',
    };
  }

  return {
    score: 0,
    misses: [
      shouldTrigger
        ? first
          ? `First tool was "${first.tool}" — not Skill/Read for "${skillName}"`
          : `No tool calls recorded`
        : detail || `Skill "${skillName}" triggered unexpectedly`,
    ],
    reasoning: shouldTrigger
      ? `Skill "${skillName}" was not triggered`
      : `False trigger: skill fired when it should not have`,
  };
});
4 changes: 2 additions & 2 deletions examples/features/transpile/csv-analyzer.EVAL.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ tests:
value: "I have a CSV of monthly sales data. Find the top 3 months by revenue."
expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)."
assertions:
- type: trigger-judge
- type: skill-trigger
skill: csv-analyzer
should_trigger: true
- type: rubrics
Expand All @@ -21,6 +21,6 @@ tests:
- id: irrelevant-query
input: "What time is it?"
assertions:
- type: trigger-judge
- type: skill-trigger
skill: csv-analyzer
should_trigger: false
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/evaluators/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ export type { LlmJudgeEvaluatorOptions } from './llm-judge.js';
export { AgentJudgeEvaluator } from './agent-judge.js';
export type { AgentJudgeEvaluatorOptions } from './agent-judge.js';

export { SkillTriggerEvaluator } from './skill-trigger.js';

export { assembleLlmJudgePrompt } from './llm-judge-prompt.js';
export type { LlmJudgePromptAssembly } from './llm-judge-prompt.js';

Expand Down
90 changes: 90 additions & 0 deletions packages/core/src/evaluation/evaluators/skill-trigger.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/**
* Built-in skill-trigger evaluator.
*
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
* - Only the FIRST tool call matters.
* - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
* - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
* - Any other tool as first call means the skill was not triggered.
* - Supports negative cases via should_trigger: false.
*/

import type { SkillTriggerEvaluatorConfig } from '../types.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';

export class SkillTriggerEvaluator implements Evaluator {
  readonly kind = 'skill-trigger';

  private readonly config: SkillTriggerEvaluatorConfig;

  constructor(config: SkillTriggerEvaluatorConfig) {
    this.config = config;
  }

  /**
   * Score a run by inspecting its first tool call only.
   *
   * Mirrors run_eval.py: a non-Skill/Read first tool means the skill did not
   * trigger, so later calls are never considered. Supports negative cases via
   * `should_trigger: false` (default is true).
   */
  evaluate(context: EvaluationContext): EvaluationScore {
    const skillName = this.config.skill;
    // Negative cases must opt in explicitly; anything else expects a trigger.
    const shouldTrigger = this.config.should_trigger !== false;

    // Find the first tool call in transcript order.
    let initialCall;
    for (const message of context.output ?? []) {
      const calls = message.toolCalls ?? [];
      if (calls.length > 0) {
        initialCall = calls[0];
        break;
      }
    }

    let fired = false;
    let detail = '';

    if (initialCall !== undefined) {
      const args = (initialCall.input ?? {}) as Record<string, unknown>;

      if (initialCall.tool === 'Skill') {
        // Skill tool: skill argument must contain the name (case-sensitive).
        const invoked = String(args.skill ?? '');
        if (invoked.includes(skillName)) {
          fired = true;
          detail = `Skill tool invoked with skill="${invoked}"`;
        }
      } else if (initialCall.tool === 'Read') {
        // Read tool: loading the skill file also counts as a trigger.
        const filePath = String(args.file_path ?? '');
        if (filePath.includes(skillName)) {
          fired = true;
          detail = `Read tool loaded skill file: ${filePath}`;
        }
      }
      // Any other tool as the first call leaves `fired` false.
    }

    if (fired === shouldTrigger) {
      const hit = shouldTrigger
        ? detail || `Skill "${skillName}" triggered as expected`
        : `Skill "${skillName}" correctly did not trigger`;
      return {
        score: 1,
        verdict: 'pass',
        hits: [hit],
        misses: [],
        expectedAspectCount: 1,
        reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger',
      };
    }

    let miss: string;
    if (shouldTrigger) {
      miss = initialCall
        ? `First tool was "${initialCall.tool}" — not Skill/Read for "${skillName}"`
        : 'No tool calls recorded';
    } else {
      miss = detail || `Skill "${skillName}" triggered unexpectedly`;
    }

    return {
      score: 0,
      verdict: 'fail',
      hits: [],
      misses: [miss],
      expectedAspectCount: 1,
      reasoning: shouldTrigger
        ? `Skill "${skillName}" was not triggered`
        : 'False trigger: skill fired when it should not have',
    };
  }
}
16 changes: 8 additions & 8 deletions packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null {
const type = entry.type;

switch (type) {
case 'trigger-judge':
case 'skill-trigger':
// Handled separately — not an NL assertion
return null;

Expand Down Expand Up @@ -232,11 +232,11 @@ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] {
}

/**
 * Extract skill-trigger entries from an assertion list.
 * Returns entries with type === 'skill-trigger'.
 */
function extractTriggerJudges(assertions: RawAssertEntry[]): RawAssertEntry[] {
  // Structural skill-trigger assertions are handled separately from NL ones.
  return assertions.filter((entry) => entry.type === 'skill-trigger');
}

/**
Expand Down Expand Up @@ -370,7 +370,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi

// Suite-level NL assertions (appended to every test)
const suiteNlAssertions: string[] = suiteAssertions
.filter((a) => a.type !== 'trigger-judge')
.filter((a) => a.type !== 'skill-trigger')
.flatMap(assertionToNaturalLanguageList);

/**
Expand All @@ -395,7 +395,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`);
}

// Collect NL assertions (not trigger-judge)
// Collect NL assertions (not skill-trigger)
const nlAssertions: string[] = [];

// Prepend test-level criteria as NL assertion
Expand All @@ -404,7 +404,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
}

for (const entry of caseAssertions) {
if (entry.type !== 'trigger-judge') {
if (entry.type !== 'skill-trigger') {
nlAssertions.push(...assertionToNaturalLanguageList(entry));
}
}
Expand All @@ -429,7 +429,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
};

if (triggerJudges.length === 0) {
// No trigger-judge: place in dominant skill (or _no-skill)
// No skill-trigger: place in dominant skill (or _no-skill)
// Determine dominant skill by scanning all tests (first occurrence wins)
// We defer this: record with a sentinel and resolve after all tests are processed.
// For now, push to _no-skill; we'll re-assign at the end.
Expand Down
27 changes: 27 additions & 0 deletions packages/core/src/evaluation/loaders/evaluator-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,28 @@ async function parseEvaluatorList(
continue;
}

if (typeValue === 'skill-trigger') {
const skillName = asString(rawEvaluator.skill);
if (!skillName) {
logWarning(`Skipping skill-trigger evaluator '${name}' in '${evalId}': missing skill`);
continue;
}
const rawShouldTrigger = rawEvaluator.should_trigger;
const shouldTrigger = typeof rawShouldTrigger === 'boolean' ? rawShouldTrigger : undefined;
const weight = validateWeight(rawEvaluator.weight, name, evalId);
const required = parseRequired(rawEvaluator.required);
evaluators.push({
name,
type: 'skill-trigger',
skill: skillName,
...(shouldTrigger !== undefined ? { should_trigger: shouldTrigger } : {}),
...(weight !== undefined ? { weight } : {}),
...(required !== undefined ? { required } : {}),
...(negate !== undefined ? { negate } : {}),
});
continue;
}

if (typeValue === 'contains') {
const value = asString(rawEvaluator.value);
if (!value) {
Expand Down Expand Up @@ -1283,6 +1305,7 @@ async function parseEvaluatorList(

/** Assertion evaluator types that support auto-generated names. */
const ASSERTION_TYPES = new Set([
'skill-trigger',
'contains',
'contains-any',
'contains-all',
Expand Down Expand Up @@ -1310,6 +1333,10 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : undefined;

switch (typeValue) {
case 'skill-trigger': {
const skillValue = asString(rawEvaluator.skill);
return skillValue ? `skill-trigger-${skillValue}` : 'skill-trigger';
}
case 'contains':
return value ? `contains-${value}` : 'contains';
case 'contains-any':
Expand Down
8 changes: 8 additions & 0 deletions packages/core/src/evaluation/registry/builtin-evaluators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
FieldAccuracyEvaluator,
LatencyEvaluator,
LlmJudgeEvaluator,
SkillTriggerEvaluator,
TokenUsageEvaluator,
ToolTrajectoryEvaluator,
runContainsAllAssertion,
Expand Down Expand Up @@ -55,6 +56,7 @@ import type {
LatencyEvaluatorConfig,
LlmJudgeEvaluatorConfig,
RegexEvaluatorConfig,
SkillTriggerEvaluatorConfig,
StartsWithEvaluatorConfig,
TokenUsageEvaluatorConfig,
} from '../types.js';
Expand Down Expand Up @@ -235,6 +237,11 @@ export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => {
});
};

/** Factory for `skill-trigger` evaluator. */
export const skillTriggerFactory: EvaluatorFactoryFn = (config) =>
  new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig);

/** Factory for `contains` deterministic assertion. */
export const containsFactory: EvaluatorFactoryFn = (config) => {
const c = config as ContainsEvaluatorConfig;
Expand Down Expand Up @@ -434,6 +441,7 @@ export function createBuiltinRegistry(): EvaluatorRegistry {
.register('token-usage', tokenUsageFactory)
.register('execution-metrics', executionMetricsFactory)
.register('agent-judge', agentJudgeFactory)
.register('skill-trigger', skillTriggerFactory)
.register('contains', containsFactory)
.register('contains-any', containsAnyFactory)
.register('contains-all', containsAllFactory)
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/evaluation/registry/judge-discovery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* them as code-judge evaluators in the registry. The file name (without
* extension) becomes the evaluator type name.
*
* Example: `.agentv/judges/trigger-judge.ts` → type "trigger-judge" in EVAL.yaml
* Example: `.agentv/judges/custom-judge.ts` → type "custom-judge" in EVAL.yaml
*/

import path from 'node:path';
Expand Down
Loading