Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,89 +1,13 @@
#!/usr/bin/env bun
/**
 * @deprecated This file is kept for reference only.
 *
 * `trigger-judge` has been renamed to `skill-trigger` and promoted to a built-in
 * evaluator in agentv core. You no longer need this file in .agentv/judges/.
 *
 * Update your EVAL.yaml to use the built-in type:
 *
 *   assertions:
 *     - type: skill-trigger        # built-in, no extra file needed
 *       skill: my-skill-name
 *       should_trigger: true
 */
import { defineCodeJudge } from '@agentv/eval';

/**
 * Code judge: reports whether the agent invoked the named Claude Code skill.
 *
 * Only the first tool call across the whole transcript is examined, mirroring
 * run_eval.py's early exit on any non-Skill/Read first tool. Supports negative
 * expectations via `should_trigger: false`.
 */
export default defineCodeJudge(({ output, config }) => {
  const skillName = config?.skill as string | undefined;
  // Unless the YAML explicitly opts out, a trigger is expected.
  const shouldTrigger = (config?.should_trigger ?? true) as boolean;

  if (!skillName) {
    return {
      score: 0,
      misses: ['config.skill is required'],
      reasoning: 'No skill name configured',
    };
  }

  // Gather every tool call in transcript order; only the first one matters.
  const allCalls = (output ?? []).flatMap((message) => message.toolCalls ?? []);
  const first = allCalls.length > 0 ? allCalls[0] : undefined;

  let fired = false;
  let detail = '';

  if (first) {
    const args = (first.input ?? {}) as Record<string, unknown>;
    switch (first.tool) {
      case 'Skill': {
        // Skill tool: the skill argument must contain the name (case-sensitive).
        const invoked = String(args.skill ?? '');
        if (invoked.includes(skillName)) {
          fired = true;
          detail = `Skill tool invoked with skill="${invoked}"`;
        }
        break;
      }
      case 'Read': {
        // Read tool: loading the skill file also counts as a trigger.
        const path = String(args.file_path ?? '');
        if (path.includes(skillName)) {
          fired = true;
          detail = `Read tool loaded skill file: ${path}`;
        }
        break;
      }
      default:
        // Any other tool as the first call: not a trigger.
        break;
    }
  }

  if (fired === shouldTrigger) {
    return {
      score: 1,
      hits: [
        shouldTrigger
          ? detail || `Skill "${skillName}" triggered as expected`
          : `Skill "${skillName}" correctly did not trigger`,
      ],
      reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger',
    };
  }

  return {
    score: 0,
    misses: [
      shouldTrigger
        ? first
          ? `First tool was "${first.tool}" — not Skill/Read for "${skillName}"`
          : `No tool calls recorded`
        : detail || `Skill "${skillName}" triggered unexpectedly`,
    ],
    reasoning: shouldTrigger
      ? `Skill "${skillName}" was not triggered`
      : `False trigger: skill fired when it should not have`,
  };
});
4 changes: 2 additions & 2 deletions examples/features/transpile/csv-analyzer.EVAL.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ tests:
value: "I have a CSV of monthly sales data. Find the top 3 months by revenue."
expected_output: "The top 3 months by revenue are November ($22,500), September ($20,100), and December ($19,400)."
assertions:
- type: trigger-judge
- type: skill-trigger
skill: csv-analyzer
should_trigger: true
- type: rubrics
Expand All @@ -21,6 +21,6 @@ tests:
- id: irrelevant-query
input: "What time is it?"
assertions:
- type: trigger-judge
- type: skill-trigger
skill: csv-analyzer
should_trigger: false
2 changes: 2 additions & 0 deletions packages/core/src/evaluation/evaluators/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -55,6 +55,8 @@ export type { LlmJudgeEvaluatorOptions } from './llm-judge.js';
export { AgentJudgeEvaluator } from './agent-judge.js';
export type { AgentJudgeEvaluatorOptions } from './agent-judge.js';

export { SkillTriggerEvaluator } from './skill-trigger.js';

export { assembleLlmJudgePrompt } from './llm-judge-prompt.js';
export type { LlmJudgePromptAssembly } from './llm-judge-prompt.js';

Expand Down
90 changes: 90 additions & 0 deletions packages/core/src/evaluation/evaluators/skill-trigger.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/**
* Built-in skill-trigger evaluator.
*
* Detects whether the agent invoked a named Claude Code skill as its first tool call.
* Mirrors the post-hoc fallback detection in skill-creator's run_eval.py:
* - Only the FIRST tool call matters.
* - Skill tool: checks input.skill contains the skill name (case-sensitive substring).
* - Read tool: checks input.file_path contains the skill name (case-sensitive substring).
* - Any other tool as first call means the skill was not triggered.
* - Supports negative cases via should_trigger: false.
*/

import type { SkillTriggerEvaluatorConfig } from '../types.js';
import type { EvaluationContext, EvaluationScore, Evaluator } from './types.js';

export class SkillTriggerEvaluator implements Evaluator {
  readonly kind = 'skill-trigger';

  private readonly config: SkillTriggerEvaluatorConfig;

  constructor(config: SkillTriggerEvaluatorConfig) {
    this.config = config;
  }

  /**
   * Score a run by inspecting its first tool call only.
   *
   * Mirrors run_eval.py: a non-Skill/Read first tool means the skill did not
   * trigger, so later calls are never considered. Supports negative cases via
   * `should_trigger: false` (default is true).
   */
  evaluate(context: EvaluationContext): EvaluationScore {
    const skillName = this.config.skill;
    // Negative cases must opt in explicitly; anything else expects a trigger.
    const shouldTrigger = this.config.should_trigger !== false;

    // Find the first tool call in transcript order.
    let initialCall;
    for (const message of context.output ?? []) {
      const calls = message.toolCalls ?? [];
      if (calls.length > 0) {
        initialCall = calls[0];
        break;
      }
    }

    let fired = false;
    let detail = '';

    if (initialCall !== undefined) {
      const args = (initialCall.input ?? {}) as Record<string, unknown>;

      if (initialCall.tool === 'Skill') {
        // Skill tool: skill argument must contain the name (case-sensitive).
        const invoked = String(args.skill ?? '');
        if (invoked.includes(skillName)) {
          fired = true;
          detail = `Skill tool invoked with skill="${invoked}"`;
        }
      } else if (initialCall.tool === 'Read') {
        // Read tool: loading the skill file also counts as a trigger.
        const filePath = String(args.file_path ?? '');
        if (filePath.includes(skillName)) {
          fired = true;
          detail = `Read tool loaded skill file: ${filePath}`;
        }
      }
      // Any other tool as the first call leaves `fired` false.
    }

    if (fired === shouldTrigger) {
      const hit = shouldTrigger
        ? detail || `Skill "${skillName}" triggered as expected`
        : `Skill "${skillName}" correctly did not trigger`;
      return {
        score: 1,
        verdict: 'pass',
        hits: [hit],
        misses: [],
        expectedAspectCount: 1,
        reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger',
      };
    }

    let miss: string;
    if (shouldTrigger) {
      miss = initialCall
        ? `First tool was "${initialCall.tool}" — not Skill/Read for "${skillName}"`
        : 'No tool calls recorded';
    } else {
      miss = detail || `Skill "${skillName}" triggered unexpectedly`;
    }

    return {
      score: 0,
      verdict: 'fail',
      hits: [],
      misses: [miss],
      expectedAspectCount: 1,
      reasoning: shouldTrigger
        ? `Skill "${skillName}" was not triggered`
        : 'False trigger: skill fired when it should not have',
    };
  }
}
16 changes: 8 additions & 8 deletions packages/core/src/evaluation/loaders/eval-yaml-transpiler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -90,7 +90,7 @@ function assertionToNaturalLanguage(entry: RawAssertEntry): string | null {
const type = entry.type;

switch (type) {
case 'trigger-judge':
case 'skill-trigger':
// Handled separately — not an NL assertion
return null;

Expand Down Expand Up @@ -232,11 +232,11 @@ function assertionToNaturalLanguageList(entry: RawAssertEntry): string[] {
}

/**
 * Extract skill-trigger entries from an assertion list.
 * Returns entries with type === 'skill-trigger'.
 */
function extractTriggerJudges(assertions: RawAssertEntry[]): RawAssertEntry[] {
  // Structural skill-trigger assertions are handled separately from NL ones.
  return assertions.filter((entry) => entry.type === 'skill-trigger');
}

/**
Expand Down Expand Up @@ -370,7 +370,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi

// Suite-level NL assertions (appended to every test)
const suiteNlAssertions: string[] = suiteAssertions
.filter((a) => a.type !== 'trigger-judge')
.filter((a) => a.type !== 'skill-trigger')
.flatMap(assertionToNaturalLanguageList);

/**
Expand All @@ -395,7 +395,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
warnings.push(`Test '${caseId}': 'assert' is deprecated. Use 'assertions' instead.`);
}

// Collect NL assertions (not trigger-judge)
// Collect NL assertions (not skill-trigger)
const nlAssertions: string[] = [];

// Prepend test-level criteria as NL assertion
Expand All @@ -404,7 +404,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
}

for (const entry of caseAssertions) {
if (entry.type !== 'trigger-judge') {
if (entry.type !== 'skill-trigger') {
nlAssertions.push(...assertionToNaturalLanguageList(entry));
}
}
Expand All @@ -429,7 +429,7 @@ export function transpileEvalYaml(suite: unknown, source = 'EVAL.yaml'): Transpi
};

if (triggerJudges.length === 0) {
// No trigger-judge: place in dominant skill (or _no-skill)
// No skill-trigger: place in dominant skill (or _no-skill)
// Determine dominant skill by scanning all tests (first occurrence wins)
// We defer this: record with a sentinel and resolve after all tests are processed.
// For now, push to _no-skill; we'll re-assign at the end.
Expand Down
27 changes: 27 additions & 0 deletions packages/core/src/evaluation/loaders/evaluator-parser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -928,6 +928,28 @@ async function parseEvaluatorList(
continue;
}

if (typeValue === 'skill-trigger') {
const skillName = asString(rawEvaluator.skill);
if (!skillName) {
logWarning(`Skipping skill-trigger evaluator '${name}' in '${evalId}': missing skill`);
continue;
}
const rawShouldTrigger = rawEvaluator.should_trigger;
const shouldTrigger = typeof rawShouldTrigger === 'boolean' ? rawShouldTrigger : undefined;
const weight = validateWeight(rawEvaluator.weight, name, evalId);
const required = parseRequired(rawEvaluator.required);
evaluators.push({
name,
type: 'skill-trigger',
skill: skillName,
...(shouldTrigger !== undefined ? { should_trigger: shouldTrigger } : {}),
...(weight !== undefined ? { weight } : {}),
...(required !== undefined ? { required } : {}),
...(negate !== undefined ? { negate } : {}),
});
continue;
}

if (typeValue === 'contains') {
const value = asString(rawEvaluator.value);
if (!value) {
Expand Down Expand Up @@ -1283,6 +1305,7 @@ async function parseEvaluatorList(

/** Assertion evaluator types that support auto-generated names. */
const ASSERTION_TYPES = new Set([
'skill-trigger',
'contains',
'contains-any',
'contains-all',
Expand Down Expand Up @@ -1310,6 +1333,10 @@ function generateAssertionName(typeValue: string, rawEvaluator: JsonObject): str
const arrayValue = Array.isArray(rawEvaluator.value) ? rawEvaluator.value : undefined;

switch (typeValue) {
case 'skill-trigger': {
const skillValue = asString(rawEvaluator.skill);
return skillValue ? `skill-trigger-${skillValue}` : 'skill-trigger';
}
case 'contains':
return value ? `contains-${value}` : 'contains';
case 'contains-any':
Expand Down
8 changes: 8 additions & 0 deletions packages/core/src/evaluation/registry/builtin-evaluators.ts
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@ import {
FieldAccuracyEvaluator,
LatencyEvaluator,
LlmJudgeEvaluator,
SkillTriggerEvaluator,
TokenUsageEvaluator,
ToolTrajectoryEvaluator,
runContainsAllAssertion,
Expand Down Expand Up @@ -55,6 +56,7 @@ import type {
LatencyEvaluatorConfig,
LlmJudgeEvaluatorConfig,
RegexEvaluatorConfig,
SkillTriggerEvaluatorConfig,
StartsWithEvaluatorConfig,
TokenUsageEvaluatorConfig,
} from '../types.js';
Expand Down Expand Up @@ -235,6 +237,11 @@ export const agentJudgeFactory: EvaluatorFactoryFn = (config, context) => {
});
};

/** Factory for `skill-trigger` evaluator. */
export const skillTriggerFactory: EvaluatorFactoryFn = (config) =>
  new SkillTriggerEvaluator(config as SkillTriggerEvaluatorConfig);

/** Factory for `contains` deterministic assertion. */
export const containsFactory: EvaluatorFactoryFn = (config) => {
const c = config as ContainsEvaluatorConfig;
Expand Down Expand Up @@ -434,6 +441,7 @@ export function createBuiltinRegistry(): EvaluatorRegistry {
.register('token-usage', tokenUsageFactory)
.register('execution-metrics', executionMetricsFactory)
.register('agent-judge', agentJudgeFactory)
.register('skill-trigger', skillTriggerFactory)
.register('contains', containsFactory)
.register('contains-any', containsAnyFactory)
.register('contains-all', containsAllFactory)
Expand Down
2 changes: 1 addition & 1 deletion packages/core/src/evaluation/registry/judge-discovery.ts
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@
* them as code-judge evaluators in the registry. The file name (without
* extension) becomes the evaluator type name.
*
* Example: `.agentv/judges/trigger-judge.ts` → type "trigger-judge" in EVAL.yaml
* Example: `.agentv/judges/custom-judge.ts` → type "custom-judge" in EVAL.yaml
*/

import path from 'node:path';
Expand Down
Loading