From dd628ef7c88976401156b4f41758f7f621d69510 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 22 Mar 2026 07:09:25 +0000
Subject: [PATCH 1/4] feat(skill-trigger): fix multi-provider detection +
 workspace-scoped skills

- Scan full transcript (not just first tool call) so preamble meta-skills
  like using-superpowers don't cause false negatives for copilot
- Add CODEX_MATCHER: detect command_execution reading skill files via bash
  sed commands; skill name appears in the command string
- Fix codex provider: store command_execution input as { command } object
  instead of raw string so readPathFromInput can extract the path
- Add workspace template with .claude/skills/, .agents/skills/, and
  .codex/skills/ dirs so evals are self-contained and don't rely on global
  skill installation
- Add 11 new unit tests covering full transcript scanning, codex bash
  detection, and pi-coding-agent provider resolution

E2E results: copilot 4/4, codex 4/4, pi 2/4 (pi has no skill system)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../features/agent-skills-evals/README.md     |  25 ++-
 .../multi-provider-skill-trigger.EVAL.yaml    |  14 +-
 .../.agents/skills/csv-analyzer/SKILL.md      |  23 ++
 .../.claude/skills/csv-analyzer/SKILL.md      |  23 ++
 .../.codex/skills/csv-analyzer/SKILL.md       |  23 ++
 .../agent-skills-evals/workspace/AGENTS.md    |  11 +
 .../agent-skills-evals/workspace/sales.csv    |  13 ++
 .../evaluation/evaluators/skill-trigger.ts    |  50 ++++-
 .../core/src/evaluation/providers/codex.ts    |   2 +-
 .../evaluators/skill-trigger.test.ts          | 212 +++++++++++++++++-
 .../evaluation/providers/codex-sdk.test.ts    |   2 +-
 11 files changed, 376 insertions(+), 22 deletions(-)
 create mode 100644 examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md
 create mode 100644 examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md
 create mode 100644 examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md
 create mode 100644 examples/features/agent-skills-evals/workspace/AGENTS.md
 create mode 100644 examples/features/agent-skills-evals/workspace/sales.csv
diff --git a/examples/features/agent-skills-evals/README.md b/examples/features/agent-skills-evals/README.md
index d77a23471..6ecb2481c 100644
--- a/examples/features/agent-skills-evals/README.md
+++ b/examples/features/agent-skills-evals/README.md
@@ -82,6 +82,27 @@ agentv eval csv-analyzer.EVAL.yaml
 agentv eval csv-analyzer.EVAL.yaml --target default
 ```
 
+## Skill setup
+
+The `csv-analyzer` skill is included in this example under `.claude/skills/csv-analyzer/SKILL.md`. Claude Code automatically discovers skills in the project's `.claude/skills/` directory, so no global installation is needed for claude-cli, claude-sdk, or copilot targets.
+
+## Multi-provider eval
+
+`multi-provider-skill-trigger.EVAL.yaml` tests the same skill across multiple providers:
+
+```bash
+bun apps/cli/src/cli.ts eval multi-provider-skill-trigger.EVAL.yaml \
+  --target copilot --targets ../.agentv/targets.yaml
+```
+
+The `skill-trigger` evaluator automatically handles each provider's tool-call format:
+
+| Provider | Detection method |
+|----------|-----------------|
+| claude, claude-cli | `Skill` tool with `skill` input field |
+| copilot | `Using skill: <name>` tool prefix |
+| codex | `command_execution` whose command references the skill file, or an `mcp:<server>/<name>` tool name |
+
 ## Copilot note
 
 When running `skill-trigger` evals against Copilot targets, real traces may show provider-specific tool names such as:
@@ -91,6 +112,4 @@ Using skill: <skill-name>
 Viewing ...<skill-path>
 ```
 
-They may also emit `Read` tool calls with the path in `input.path` rather than `input.file_path`.
-
-These shapes come from real provider traces and should be treated as valid skill-trigger evidence for Copilot targets.
+The evaluator scans the entire conversation transcript (not just the first tool call), so a preamble meta-skill like `using-superpowers` firing before `csv-analyzer` still results in a pass.
diff --git a/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml b/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml
index 6d309d07b..377daf57f 100644
--- a/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml
+++ b/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml
@@ -1,14 +1,22 @@
 # Multi-provider skill-trigger evaluation example.
 #
+# Uses an isolated workspace template (workspace/) so no global skill installation
+# is needed. The workspace contains:
+#   .claude/skills/csv-analyzer/  — for claude/copilot providers
+#   .agents/skills/csv-analyzer/  — for codex provider (mirrored in .codex/skills/)
+#
 # The same EVAL.yaml works with any provider — just change --target:
 #
-#   agentv eval this-file.EVAL.yaml --target claude
-#   agentv eval this-file.EVAL.yaml --target copilot
-#   agentv eval this-file.EVAL.yaml --target pi-coding-agent
+#   agentv eval this-file.EVAL.yaml --target claude --targets ../.agentv/targets.yaml
+#   agentv eval this-file.EVAL.yaml --target copilot --targets ../.agentv/targets.yaml
+#   agentv eval this-file.EVAL.yaml --target codex --targets ../.agentv/targets.yaml
 #
 # The evaluator automatically resolves the correct tool names for each
 # provider. No provider-specific config needed in test cases.
 
+workspace:
+  template: workspace/
+
 tests:
   # === Positive cases: skill should trigger ===
 
diff --git a/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md
new file mode 100644
index 000000000..e52c484ef
--- /dev/null
+++ b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md
new file mode 100644
index 000000000..e52c484ef
--- /dev/null
+++ b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md
new file mode 100644
index 000000000..e52c484ef
--- /dev/null
+++ b/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/examples/features/agent-skills-evals/workspace/AGENTS.md b/examples/features/agent-skills-evals/workspace/AGENTS.md
new file mode 100644
index 000000000..4b57137b8
--- /dev/null
+++ b/examples/features/agent-skills-evals/workspace/AGENTS.md
@@ -0,0 +1,11 @@
+# Agent Instructions
+
+## Skills
+
+Domain skills are in `.agents/skills/` relative to your working directory.
+Check for a relevant skill before responding to any task.
+
+Available skills:
+
+- **csv-analyzer** — use when the user asks to analyze, summarize, or extract
+  insights from CSV data or files. Skill file: `.agents/skills/csv-analyzer/SKILL.md`
diff --git a/examples/features/agent-skills-evals/workspace/sales.csv b/examples/features/agent-skills-evals/workspace/sales.csv
new file mode 100644
index 000000000..7f29d45a6
--- /dev/null
+++ b/examples/features/agent-skills-evals/workspace/sales.csv
@@ -0,0 +1,13 @@
+month,revenue,units_sold
+January,12500,150
+February,9800,120
+March,15200,180
+April,11300,140
+May,18900,220
+June,14700,175
+July,16500,195
+August,13200,160
+September,20100,240
+October,17800,210
+November,22500,265
+December,19400,230
diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts
index 856a7c7a4..b4302494f 100644
--- a/packages/core/src/evaluation/evaluators/skill-trigger.ts
+++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts
@@ -58,6 +58,29 @@ const COPILOT_MATCHER: ToolMatcher = {
   readInputFields: ['file_path', 'path'],
 };
 
+/**
+ * Codex reads skill files via command_execution using a bash sed command containing
+ * the skill file path. The skill name appears in the command string, so we match
+ * any command_execution whose command field includes the skill name.
+ *
+ * Skill lookup order (workspace-scoped first):
+ *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative)
+ *   2. .codex/skills/<skill-name>/SKILL.md   (fallback)
+ *   3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
+ *
+ * MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for
+ * Codex configurations that surface skills as MCP tools.
+ */
+const CODEX_MATCHER: ToolMatcher = {
+  skillTools: [],
+  skillInputField: 'skill',
+  readTools: ['command_execution'],
+  readInputField: 'command',
+  skillToolPrefixes: ['mcp:'],
+  readToolPrefixes: ['mcp:'],
+  readInputFields: ['command', 'path', 'file_path', 'filePath'],
+};
+
 /**
  * Static mapping of provider kinds to their tool-name semantics.
  * Providers not listed here fall back to CLAUDE_MATCHER.
@@ -66,6 +89,7 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
   claude: CLAUDE_MATCHER,
   'claude-cli': CLAUDE_MATCHER,
   'claude-sdk': CLAUDE_MATCHER,
+  codex: CODEX_MATCHER,
   'pi-coding-agent': CLAUDE_MATCHER,
   'pi-agent-sdk': CLAUDE_MATCHER,
   'copilot-cli': COPILOT_MATCHER,
@@ -97,40 +121,44 @@ export class SkillTriggerEvaluator implements Evaluator {
     const providerKind = context.provider?.kind as ProviderKind | undefined;
     const matcher = this.resolveMatcher(providerKind);
 
-    const firstTool = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0];
+    const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
 
     let triggered = false;
     let evidence = '';
 
-    if (firstTool) {
-      const input = (firstTool.input ?? {}) as Record<string, unknown>;
+    for (const toolCall of allToolCalls) {
+      const input = (toolCall.input ?? {}) as Record<string, unknown>;
 
-      if (matcher.skillTools.includes(firstTool.tool)) {
+      if (matcher.skillTools.includes(toolCall.tool)) {
         const skillArg = String(input[matcher.skillInputField] ?? '');
         if (skillArg.includes(skillName)) {
           triggered = true;
           evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
+          break;
         }
       } else if (
         matcher.skillToolPrefixes?.some(
-          (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName),
+          (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName),
         )
       ) {
         triggered = true;
-        evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
-      } else if (matcher.readTools.includes(firstTool.tool)) {
+        evidence = `Skill tool invoked via tool name "${toolCall.tool}"`;
+        break;
+      } else if (matcher.readTools.includes(toolCall.tool)) {
         const filePath = this.readPathFromInput(input, matcher);
         if (filePath.includes(skillName)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
+          break;
         }
       } else if (
         matcher.readToolPrefixes?.some(
-          (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName),
+          (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName),
         )
       ) {
         triggered = true;
-        evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
+        evidence = `Read tool loaded skill file via tool name "${toolCall.tool}"`;
+        break;
       }
     }
 
@@ -158,8 +186,8 @@ export class SkillTriggerEvaluator implements Evaluator {
       assertions: [
         {
           text: shouldTrigger
-            ? firstTool
-              ? `First tool was "${firstTool.tool}" — not a skill/read tool for "${skillName}"`
+            ? allToolCalls.length > 0
+              ? `Skill "${skillName}" not found in ${allToolCalls.length} tool call(s)`
               : 'No tool calls recorded'
             : evidence || `Skill "${skillName}" triggered unexpectedly`,
           passed: false,
diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts
index bddca49d1..9f2ca0ce7 100644
--- a/packages/core/src/evaluation/providers/codex.ts
+++ b/packages/core/src/evaluation/providers/codex.ts
@@ -235,7 +235,7 @@ export class CodexProvider implements Provider {
     if (itemType === 'command_execution') {
       completedToolCalls.push({
         tool: 'command_execution',
-        input: item.command,
+        input: { command: item.command },
         output: item.aggregated_output,
         id: item.id,
       });
diff --git a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
index dacfea669..f9c0f4e04 100644
--- a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
+++ b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
@@ -86,6 +86,64 @@ describe('SkillTriggerEvaluator', () => {
       expect(result.verdict).toBe('pass');
     });
 
+    it('should detect codex mcp skill tool (skill name in tool name)', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'codex', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [{ tool: 'mcp:claude-code/csv-analyzer', input: {} }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+      expect(result.score).toBe(1);
+    });
+
+    it('should detect codex mcp skill tool with arbitrary server name', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'codex', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [{ tool: 'mcp:skills/csv-analyzer', input: {} }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+
+    it('should detect codex bash command_execution reading skill file', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'codex', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              {
+                tool: 'command_execution',
+                input: {
+                  command:
+                    '/bin/bash -lc "sed -n \'1,220p\' /home/user/.agents/skills/csv-analyzer/SKILL.md"',
+                },
+              },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+      expect(result.score).toBe(1);
+    });
+
     it('should fail for codex with non-matching tool calls', () => {
       const evaluator = new SkillTriggerEvaluator(makeConfig());
       const context = makeContext({
@@ -94,13 +152,13 @@ describe('SkillTriggerEvaluator', () => {
           {
             role: 'assistant',
             content: 'some response',
-            toolCalls: [{ tool: 'command_execution', input: 'ls -la' }],
+            toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }],
           },
         ],
       });
       const result = evaluator.evaluate(context);
       expect(result.verdict).toBe('fail');
-      expect(result.assertions.filter((a) => !a.passed)[0].text).toContain('command_execution');
+      expect(result.assertions.filter((a) => !a.passed)[0].text).toContain('csv-analyzer');
     });
 
     it('should pass for codex with should_trigger: false and unrelated tool', () => {
@@ -111,7 +169,7 @@ describe('SkillTriggerEvaluator', () => {
           {
             role: 'assistant',
             content: 'some response',
-            toolCalls: [{ tool: 'command_execution', input: 'ls -la' }],
+            toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }],
           },
         ],
       });
@@ -197,6 +255,154 @@ describe('SkillTriggerEvaluator', () => {
     });
   });
 
+  describe('full transcript scanning', () => {
+    it('should pass when skill triggers after a preamble meta-skill', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'copilot-cli', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              { tool: 'Using skill: using-superpowers', input: {} },
+              { tool: 'Using skill: csv-analyzer', input: {} },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+
+    it('should pass when skill triggers in a later message', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        output: [
+          {
+            role: 'assistant',
+            content: 'thinking...',
+            toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }],
+          },
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+
+    it('should fail when target skill never appears anywhere in transcript', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              { tool: 'Using skill: using-superpowers', input: {} },
+              { tool: 'Bash', input: { command: 'ls' } },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('fail');
+    });
+
+    it('should pass for should_trigger:false when skill never appears in transcript', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false }));
+      const context = makeContext({
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [{ tool: 'Using skill: using-superpowers', input: {} }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+
+    it('should fail for should_trigger:false when skill appears later in transcript', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false }));
+      const context = makeContext({
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              { tool: 'Bash', input: { command: 'ls' } },
+              { tool: 'Skill', input: { skill: 'csv-analyzer' } },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('fail');
+    });
+  });
+
+  describe('pi-coding-agent tools', () => {
+    it('should detect Skill tool for pi-coding-agent', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'pi-coding-agent', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+      expect(result.score).toBe(1);
+    });
+
+    it('should detect Read tool for pi-coding-agent when reading skill file', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'pi-coding-agent', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              {
+                tool: 'Read',
+                input: { file_path: '/skills/csv-analyzer/SKILL.md' },
+              },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+
+    it('should pass for pi-coding-agent with should_trigger: false and unrelated tool', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false }));
+      const context = makeContext({
+        provider: { kind: 'pi-coding-agent', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: 'some response',
+            toolCalls: [{ tool: 'bash', input: { command: 'ls' } }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+    });
+  });
+
   describe('copilot-specific tools', () => {
     it('should recognize readFile tool for copilot', () => {
       const evaluator = new SkillTriggerEvaluator(makeConfig());
diff --git a/packages/core/test/evaluation/providers/codex-sdk.test.ts b/packages/core/test/evaluation/providers/codex-sdk.test.ts
index 1b6b9e6b6..0e76d586e 100644
--- a/packages/core/test/evaluation/providers/codex-sdk.test.ts
+++ b/packages/core/test/evaluation/providers/codex-sdk.test.ts
@@ -289,7 +289,7 @@ describe('CodexProvider (SDK)', () => {
     expect(msg?.toolCalls).toBeDefined();
     expect(msg?.toolCalls?.length).toBe(1);
     expect(msg?.toolCalls?.[0]?.tool).toBe('command_execution');
-    expect(msg?.toolCalls?.[0]?.input).toBe('ls -la');
+    expect(msg?.toolCalls?.[0]?.input).toEqual({ command: 'ls -la' });
     expect(msg?.toolCalls?.[0]?.output).toBe('file1.ts\nfile2.ts');
     expect(msg?.toolCalls?.[0]?.id).toBe('cmd-1');
   });

From 1ec4ce99640ed92ca9d1d0ca91ed888e4ca9edf3 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 22 Mar 2026 07:21:48 +0000
Subject: [PATCH 2/4] fix(pi-coding-agent): extract toolCall format + add
 PI_CODING_AGENT_MATCHER

Pi CLI emits `type: 'toolCall'` with `arguments` (not `tool_use` / `input`),
so tool calls were silently dropped. Also add PI_CODING_AGENT_MATCHER using
lowercase `read` + `path` field to match Pi's actual tool names.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .../evaluation/evaluators/skill-trigger.ts    | 18 ++++++-
 .../evaluation/providers/pi-coding-agent.ts   |  8 +++
 .../evaluators/skill-trigger.test.ts          | 53 +++++++++++++++++--
 3 files changed, 73 insertions(+), 6 deletions(-)

diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts
index b4302494f..a7fe545ff 100644
--- a/packages/core/src/evaluation/evaluators/skill-trigger.ts
+++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts
@@ -58,6 +58,22 @@ const COPILOT_MATCHER: ToolMatcher = {
   readInputFields: ['file_path', 'path'],
 };
 
+/**
+ * Pi CLI reads skill files using the lowercase `read` tool with a `path` argument.
+ * Skills are auto-discovered from `.agents/skills/` relative to the working directory.
+ *
+ * Skill lookup order (workspace-scoped first):
+ *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative, auto-discovered)
+ *   2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
+ */
+const PI_CODING_AGENT_MATCHER: ToolMatcher = {
+  skillTools: [],
+  skillInputField: 'skill',
+  readTools: ['read'],
+  readInputField: 'path',
+  readInputFields: ['path', 'file_path', 'filePath'],
+};
+
 /**
  * Codex reads skill files via command_execution using a bash sed command containing
  * the skill file path. The skill name appears in the command string, so we match
@@ -90,7 +106,7 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
   'claude-cli': CLAUDE_MATCHER,
   'claude-sdk': CLAUDE_MATCHER,
   codex: CODEX_MATCHER,
-  'pi-coding-agent': CLAUDE_MATCHER,
+  'pi-coding-agent': PI_CODING_AGENT_MATCHER,
   'pi-agent-sdk': CLAUDE_MATCHER,
   'copilot-cli': COPILOT_MATCHER,
   'copilot-sdk': COPILOT_MATCHER,
diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts
index 43a07318d..6438af0b6 100644
--- a/packages/core/src/evaluation/providers/pi-coding-agent.ts
+++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts
@@ -771,6 +771,14 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
         id: typeof p.id === 'string' ? p.id : undefined,
       });
     }
+    // Pi CLI emits toolCall (camelCase) with arguments (not input)
+    if (p.type === 'toolCall' && typeof p.name === 'string') {
+      toolCalls.push({
+        tool: p.name,
+        input: p.arguments,
+        id: typeof p.id === 'string' ? p.id : undefined,
+      });
+    }
     // Also handle tool_result for output
     if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
       // Find matching tool call and add output
diff --git a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
index f9c0f4e04..d8ab2b859 100644
--- a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
+++ b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts
@@ -119,6 +119,44 @@ describe('SkillTriggerEvaluator', () => {
       expect(result.verdict).toBe('pass');
     });
 
+    it('should detect pi-coding-agent read tool loading skill file', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'pi-coding-agent', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: '',
+            toolCalls: [
+              {
+                tool: 'read',
+                input: { path: '/workspace/.agents/skills/csv-analyzer/SKILL.md' },
+              },
+            ],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('pass');
+      expect(result.score).toBe(1);
+    });
+
+    it('should fail for pi-coding-agent with non-matching read call', () => {
+      const evaluator = new SkillTriggerEvaluator(makeConfig());
+      const context = makeContext({
+        provider: { kind: 'pi-coding-agent', targetName: 'test' },
+        output: [
+          {
+            role: 'assistant',
+            content: 'some response',
+            toolCalls: [{ tool: 'read', input: { path: '/workspace/README.md' } }],
+          },
+        ],
+      });
+      const result = evaluator.evaluate(context);
+      expect(result.verdict).toBe('fail');
+    });
+
     it('should detect codex bash command_execution reading skill file', () => {
       const evaluator = new SkillTriggerEvaluator(makeConfig());
       const context = makeContext({
@@ -348,7 +386,7 @@ describe('SkillTriggerEvaluator', () => {
   });
 
   describe('pi-coding-agent tools', () => {
-    it('should detect Skill tool for pi-coding-agent', () => {
+    it('should detect pi-coding-agent read tool loading skill from .agents/skills', () => {
       const evaluator = new SkillTriggerEvaluator(makeConfig());
       const context = makeContext({
         provider: { kind: 'pi-coding-agent', targetName: 'test' },
@@ -356,7 +394,12 @@ describe('SkillTriggerEvaluator', () => {
           {
             role: 'assistant',
             content: '',
-            toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }],
+            toolCalls: [
+              {
+                tool: 'read',
+                input: { path: '.agents/skills/csv-analyzer/SKILL.md' },
+              },
+            ],
           },
         ],
       });
@@ -365,7 +408,7 @@ describe('SkillTriggerEvaluator', () => {
       expect(result.score).toBe(1);
     });
 
-    it('should detect Read tool for pi-coding-agent when reading skill file', () => {
+    it('should detect pi-coding-agent read tool loading skill from global path', () => {
       const evaluator = new SkillTriggerEvaluator(makeConfig());
       const context = makeContext({
         provider: { kind: 'pi-coding-agent', targetName: 'test' },
@@ -375,8 +418,8 @@ describe('SkillTriggerEvaluator', () => {
             content: '',
             toolCalls: [
               {
-                tool: 'Read',
-                input: { file_path: '/skills/csv-analyzer/SKILL.md' },
+                tool: 'read',
+                input: { path: '/home/user/.agents/skills/csv-analyzer/SKILL.md' },
               },
             ],
           },

From 0dc7964199045b5b1b6a7fe844f08442fccde830 Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 22 Mar 2026 08:02:39 +0000
Subject: [PATCH 3/4] chore(pi-agent-sdk): remove from example targets, add
 deprecation notices

pi-agent-sdk has no tools (tools: []) so it cannot read files or invoke
skills. pi-coding-agent covers all the same use cases. Example targets.yaml
files now use openrouter directly for LLM grading.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .agentv/targets.yaml                                       | 3 +--
 .../showcase/offline-grader-benchmark/.agentv/targets.yaml | 6 ++----
 packages/core/src/evaluation/evaluators/skill-trigger.ts   | 4 +++-
 packages/core/src/evaluation/providers/index.ts            | 1 +
 packages/core/src/evaluation/providers/pi-agent-sdk.ts     | 7 +++++++
 5 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
index 7a14aa08e..13ab5e997 100644
--- a/.agentv/targets.yaml
+++ b/.agentv/targets.yaml
@@ -3,8 +3,7 @@
 
 targets:
   - name: default
-    provider: pi-agent-sdk
-    pi_provider: openrouter
+    provider: openrouter
     model: z-ai/glm-4.7
     api_key: ${{ OPENROUTER_API_KEY }}
     system_prompt: "Answer directly based on the information provided."
diff --git a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml
index 3011b53d8..34212cabf 100644
--- a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml
+++ b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml
@@ -17,15 +17,13 @@ targets:
     model: ${{ AZURE_DEPLOYMENT_NAME }}
 
   - name: grader_claude_haiku
-    provider: pi-agent-sdk
-    pi_provider: openrouter
+    provider: openrouter
     api_key: ${{ OPENROUTER_API_KEY }}
     model: anthropic/claude-haiku-4.5
     system_prompt: "Return concise structured grading output only."
 
   - name: grader_gemini_flash
-    provider: pi-agent-sdk
-    pi_provider: openrouter
+    provider: openrouter
     api_key: ${{ OPENROUTER_API_KEY }}
     model: google/gemini-3-flash-preview
     system_prompt: "Return concise structured grading output only."
diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts
index a7fe545ff..56e5246e5 100644
--- a/packages/core/src/evaluation/evaluators/skill-trigger.ts
+++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts
@@ -107,7 +107,9 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
   'claude-sdk': CLAUDE_MATCHER,
   codex: CODEX_MATCHER,
   'pi-coding-agent': PI_CODING_AGENT_MATCHER,
-  'pi-agent-sdk': CLAUDE_MATCHER,
+  // pi-agent-sdk has no tools, so skill detection is a no-op. Kept for completeness.
+  // TODO: consider removing pi-agent-sdk provider entirely.
+  'pi-agent-sdk': PI_CODING_AGENT_MATCHER,
   'copilot-cli': COPILOT_MATCHER,
   'copilot-sdk': COPILOT_MATCHER,
   vscode: COPILOT_MATCHER,
diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts
index 54445b5f2..7e69333df 100644
--- a/packages/core/src/evaluation/providers/index.ts
+++ b/packages/core/src/evaluation/providers/index.ts
@@ -100,6 +100,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry {
     .register('copilot-sdk', (t) => new CopilotSdkProvider(t.name, t.config as never))
     .register('copilot-cli', (t) => new CopilotCliProvider(t.name, t.config as never))
     .register('pi-coding-agent', (t) => new PiCodingAgentProvider(t.name, t.config as never))
+    // TODO: consider removing pi-agent-sdk — it has no tools and is superseded by pi-coding-agent
     .register('pi-agent-sdk', (t) => new PiAgentSdkProvider(t.name, t.config as never))
     // claude-cli is the new default subprocess provider; claude is an alias
     .register('claude-cli', (t) => new ClaudeCliProvider(t.name, t.config as never))
diff --git a/packages/core/src/evaluation/providers/pi-agent-sdk.ts b/packages/core/src/evaluation/providers/pi-agent-sdk.ts
index 38a619402..b36384908 100644
--- a/packages/core/src/evaluation/providers/pi-agent-sdk.ts
+++ b/packages/core/src/evaluation/providers/pi-agent-sdk.ts
@@ -66,6 +66,13 @@ interface ToolExecTracker {
  *
  * Note: Dependencies are loaded lazily on first use to avoid bundling issues.
  * Users must install @mariozechner/pi-agent-core and @mariozechner/pi-ai separately.
+ *
+ * @deprecated Prefer the `pi-coding-agent` provider; this one is a candidate for
+ * removal. It initializes with tools: [] so it cannot read files or invoke skills,
+ * making it a plain Q&A loop with no agentic capability. `pi-coding-agent` covers
+ * the same use cases and more (full tool access, workspace isolation, skill
+ * discovery). For lightweight LLM grading without a CLI dependency, use the
+ * `openrouter`, `openai`, or `gemini` providers instead.
  */
 export class PiAgentSdkProvider implements Provider {
   readonly id: string;

From 0a3a31366be76dfb679f5dae136fccaf8453658a Mon Sep 17 00:00:00 2001
From: Christopher Tso <christso@gmail.com>
Date: Sun, 22 Mar 2026 08:10:14 +0000
Subject: [PATCH 4/4] fix(targets): restore default target to pi-coding-agent

The self-evaluation default target should use the coding agent,
not a direct LLM call. The offline-grader-benchmark targets
remain as openrouter since graders only need plain LLM calls.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 .agentv/targets.yaml | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
index 13ab5e997..09b9fc8fe 100644
--- a/.agentv/targets.yaml
+++ b/.agentv/targets.yaml
@@ -3,7 +3,9 @@
 
 targets:
   - name: default
-    provider: openrouter
+    provider: pi-coding-agent
+    executable: ${{ PI_CLI_PATH }}
+    pi_provider: openrouter
     model: z-ai/glm-4.7
     api_key: ${{ OPENROUTER_API_KEY }}
     system_prompt: "Answer directly based on the information provided."