EntityProcess · christso · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026 · Mar 22, 2026
diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml
@@ -3,7 +3,8 @@
 
 targets:
   - name: default
-    provider: pi-agent-sdk
+    provider: pi-coding-agent
+    executable: ${{ PI_CLI_PATH }}
     pi_provider: openrouter
     model: z-ai/glm-4.7
     api_key: ${{ OPENROUTER_API_KEY }}

diff --git a/examples/features/agent-skills-evals/README.md b/examples/features/agent-skills-evals/README.md
@@ -82,6 +82,27 @@ agentv eval csv-analyzer.EVAL.yaml
 agentv eval csv-analyzer.EVAL.yaml --target default
 ```
 
+## Skill setup
+
+The `csv-analyzer` skill is included in this example under `.claude/skills/csv-analyzer/SKILL.md`. Claude Code automatically discovers skills in the project's `.claude/skills/` directory, so no global installation is needed for claude-cli, claude-sdk, or copilot targets.
+
+## Multi-provider eval
+
+`multi-provider-skill-trigger.EVAL.yaml` tests the same skill across multiple providers:
+
+```bash
+bun apps/cli/src/cli.ts eval multi-provider-skill-trigger.EVAL.yaml \
+  --target copilot --targets ../.agentv/targets.yaml
+```
+
+The `skill-trigger` evaluator automatically handles each provider's tool-call format:
+
+| Provider | Detection method |
+|----------|-----------------|
+| claude, claude-cli | `Skill` tool with `skill` input field |
+| copilot | `Using skill: <name>` tool prefix |
+| codex | `command_execution` whose command contains the skill name, or `mcp:<server>/<name>` tool prefix |
+
 ## Copilot note
 
 When running `skill-trigger` evals against Copilot targets, real traces may show provider-specific tool names such as:
@@ -91,6 +112,4 @@ Using skill: <skill-name>
 Viewing ...<skill-path>
 ```
 
-They may also emit `Read` tool calls with the path in `input.path` rather than `input.file_path`.
-
-These shapes come from real provider traces and should be treated as valid skill-trigger evidence for Copilot targets.
+The evaluator scans every tool call in the conversation transcript (not just the first one), so a preamble meta-skill like `using-superpowers` firing before `csv-analyzer` still results in a pass.
diff --git a/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml b/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml
@@ -1,14 +1,22 @@
 # Multi-provider skill-trigger evaluation example.
 #
+# Uses an isolated workspace template (workspace/) so no global skill installation
+# is needed. The workspace contains:
+#   .claude/skills/csv-analyzer/  — for claude/copilot providers
+#   .agents/skills/csv-analyzer/  — for codex provider (mirrored at .codex/skills/ as a fallback)
+#
 # The same EVAL.yaml works with any provider — just change --target:
 #
-#   agentv eval this-file.EVAL.yaml --target claude
-#   agentv eval this-file.EVAL.yaml --target copilot
-#   agentv eval this-file.EVAL.yaml --target pi-coding-agent
+#   agentv eval this-file.EVAL.yaml --target claude --targets ../.agentv/targets.yaml
+#   agentv eval this-file.EVAL.yaml --target copilot --targets ../.agentv/targets.yaml
+#   agentv eval this-file.EVAL.yaml --target codex --targets ../.agentv/targets.yaml
 #
 # The evaluator automatically resolves the correct tool names for each
 # provider. No provider-specific config needed in test cases.
 
+workspace:
+  template: workspace/
+
 tests:
   # === Positive cases: skill should trigger ===
 

diff --git a/...ples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md b/...ples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/...ples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md b/...ples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md
@@ -0,0 +1,23 @@
+---
+name: csv-analyzer
+description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files
+---
+
+# CSV Analyzer
+
+## Overview
+
+Analyzes CSV data to extract insights, summaries, and statistics.
+
+## How to use
+
+1. Read the CSV file
+2. Parse the columns and rows
+3. Identify key metrics (totals, averages, top N entries)
+4. Present findings clearly
+
+## Example tasks
+
+- "Find the top 5 months by revenue"
+- "Which quarter had the most growth"
+- "Summarize the sales data"
diff --git a/examples/features/agent-skills-evals/workspace/AGENTS.md b/examples/features/agent-skills-evals/workspace/AGENTS.md
@@ -0,0 +1,11 @@
+# Agent Instructions
+
+## Skills
+
+Domain skills are in `.agents/skills/` relative to your working directory.
+Check for a relevant skill before responding to any task.
+
+Available skills:
+
+- **csv-analyzer** — use when the user asks to analyze, summarize, or extract
+  insights from CSV data or files. Skill file: `.agents/skills/csv-analyzer/SKILL.md`
diff --git a/examples/features/agent-skills-evals/workspace/sales.csv b/examples/features/agent-skills-evals/workspace/sales.csv
@@ -0,0 +1,13 @@
+month,revenue,units_sold
+January,12500,150
+February,9800,120
+March,15200,180
+April,11300,140
+May,18900,220
+June,14700,175
+July,16500,195
+August,13200,160
+September,20100,240
+October,17800,210
+November,22500,265
+December,19400,230
diff --git a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml
@@ -17,15 +17,13 @@ targets:
     model: ${{ AZURE_DEPLOYMENT_NAME }}
 
   - name: grader_claude_haiku
-    provider: pi-agent-sdk
-    pi_provider: openrouter
+    provider: openrouter
     api_key: ${{ OPENROUTER_API_KEY }}
     model: anthropic/claude-haiku-4.5
     system_prompt: "Return concise structured grading output only."
 
   - name: grader_gemini_flash
-    provider: pi-agent-sdk
-    pi_provider: openrouter
+    provider: openrouter
     api_key: ${{ OPENROUTER_API_KEY }}
     model: google/gemini-3-flash-preview
     system_prompt: "Return concise structured grading output only."
diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts
@@ -58,6 +58,45 @@ const COPILOT_MATCHER: ToolMatcher = {
   readInputFields: ['file_path', 'path'],
 };
 
+/**
+ * Pi CLI reads skill files using the lowercase `read` tool with a `path` argument.
+ * Skills are auto-discovered from `.agents/skills/` relative to the working directory.
+ *
+ * Skill lookup order (workspace-scoped first):
+ *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative, auto-discovered)
+ *   2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
+ */
+const PI_CODING_AGENT_MATCHER: ToolMatcher = {
+  skillTools: [], // no dedicated skill tool is matched; detection relies on read calls
+  skillInputField: 'skill', // unused while skillTools is empty
+  readTools: ['read'],
+  readInputField: 'path',
+  readInputFields: ['path', 'file_path', 'filePath'],
+};
+
+/**
+ * Codex reads skill files via command_execution using a bash sed command containing
+ * the skill file path. The skill name appears in the command string, so we match
+ * any command_execution whose command field includes the skill name.
+ *
+ * Skill lookup order (workspace-scoped first):
+ *   1. .agents/skills/<skill-name>/SKILL.md  (workspace-relative)
+ *   2. .codex/skills/<skill-name>/SKILL.md   (fallback)
+ *   3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback)
+ *
+ * MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for
+ * Codex configurations that surface skills as MCP tools.
+ */
+const CODEX_MATCHER: ToolMatcher = {
+  skillTools: [], // no exact-name skill tool; MCP tools are matched by prefix below
+  skillInputField: 'skill', // unused while skillTools is empty
+  readTools: ['command_execution'],
+  readInputField: 'command',
+  skillToolPrefixes: ['mcp:'],
+  readToolPrefixes: ['mcp:'], // same prefix as skillToolPrefixes, which the evaluator tests first
+  readInputFields: ['command', 'path', 'file_path', 'filePath'],
+};
+
 /**
  * Static mapping of provider kinds to their tool-name semantics.
  * Providers not listed here fall back to CLAUDE_MATCHER.
@@ -66,8 +105,11 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = {
   claude: CLAUDE_MATCHER,
   'claude-cli': CLAUDE_MATCHER,
   'claude-sdk': CLAUDE_MATCHER,
-  'pi-coding-agent': CLAUDE_MATCHER,
-  'pi-agent-sdk': CLAUDE_MATCHER,
+  codex: CODEX_MATCHER,
+  'pi-coding-agent': PI_CODING_AGENT_MATCHER,
+  // pi-agent-sdk has no tools, so skill detection is a no-op. Kept for completeness.
+  // TODO: consider removing pi-agent-sdk provider entirely.
+  'pi-agent-sdk': PI_CODING_AGENT_MATCHER,
   'copilot-cli': COPILOT_MATCHER,
   'copilot-sdk': COPILOT_MATCHER,
   vscode: COPILOT_MATCHER,
@@ -97,40 +139,44 @@ export class SkillTriggerEvaluator implements Evaluator {
     const providerKind = context.provider?.kind as ProviderKind | undefined;
     const matcher = this.resolveMatcher(providerKind);
 
-    const firstTool = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0];
+    const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []);
 
     let triggered = false;
     let evidence = '';
 
-    if (firstTool) {
-      const input = (firstTool.input ?? {}) as Record<string, unknown>;
+    for (const toolCall of allToolCalls) {
+      const input = (toolCall.input ?? {}) as Record<string, unknown>;
 
-      if (matcher.skillTools.includes(firstTool.tool)) {
+      if (matcher.skillTools.includes(toolCall.tool)) {
         const skillArg = String(input[matcher.skillInputField] ?? '');
         if (skillArg.includes(skillName)) {
           triggered = true;
           evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`;
+          break;
         }
       } else if (
         matcher.skillToolPrefixes?.some(
-          (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName),
+          (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName),
         )
       ) {
         triggered = true;
-        evidence = `Skill tool invoked via tool name "${firstTool.tool}"`;
-      } else if (matcher.readTools.includes(firstTool.tool)) {
+        evidence = `Skill tool invoked via tool name "${toolCall.tool}"`;
+        break;
+      } else if (matcher.readTools.includes(toolCall.tool)) {
         const filePath = this.readPathFromInput(input, matcher);
         if (filePath.includes(skillName)) {
           triggered = true;
           evidence = `Read tool loaded skill file: ${filePath}`;
+          break;
         }
       } else if (
         matcher.readToolPrefixes?.some(
-          (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName),
+          (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName),
         )
       ) {
         triggered = true;
-        evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`;
+        evidence = `Read tool loaded skill file via tool name "${toolCall.tool}"`;
+        break;
       }
     }
 
@@ -158,8 +204,8 @@ export class SkillTriggerEvaluator implements Evaluator {
       assertions: [
         {
           text: shouldTrigger
-            ? firstTool
-              ? `First tool was "${firstTool.tool}" — not a skill/read tool for "${skillName}"`
+            ? allToolCalls.length > 0
+              ? `Skill "${skillName}" not found in ${allToolCalls.length} tool call(s)`
               : 'No tool calls recorded'
             : evidence || `Skill "${skillName}" triggered unexpectedly`,
           passed: false,

diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts
@@ -235,7 +235,7 @@ export class CodexProvider implements Provider {
     if (itemType === 'command_execution') {
       completedToolCalls.push({
         tool: 'command_execution',
-        input: item.command,
+        input: { command: item.command },
         output: item.aggregated_output,
         id: item.id,
       });

diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts
@@ -100,6 +100,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry {
     .register('copilot-sdk', (t) => new CopilotSdkProvider(t.name, t.config as never))
     .register('copilot-cli', (t) => new CopilotCliProvider(t.name, t.config as never))
     .register('pi-coding-agent', (t) => new PiCodingAgentProvider(t.name, t.config as never))
+    // TODO: consider removing pi-agent-sdk — it has no tools and is superseded by pi-coding-agent
     .register('pi-agent-sdk', (t) => new PiAgentSdkProvider(t.name, t.config as never))
     // claude-cli is the new default subprocess provider; claude is an alias
     .register('claude-cli', (t) => new ClaudeCliProvider(t.name, t.config as never))

diff --git a/packages/core/src/evaluation/providers/pi-agent-sdk.ts b/packages/core/src/evaluation/providers/pi-agent-sdk.ts
@@ -66,6 +66,13 @@ interface ToolExecTracker {
  *
  * Note: Dependencies are loaded lazily on first use to avoid bundling issues.
  * Users must install @mariozechner/pi-agent-core and @mariozechner/pi-ai separately.
+ *
+ * @deprecated Consider removing this provider. It initializes with tools: [] so it
+ * cannot read files or invoke skills — making it a plain Q&A loop with no agentic
+ * capability. The `pi-coding-agent` provider covers the same use cases and more
+ * (full tool access, workspace isolation, skill discovery). For lightweight LLM
+ * grading without a CLI dependency, use the `openrouter`, `openai`, or `gemini`
+ * providers instead.
  */
 export class PiAgentSdkProvider implements Provider {
   readonly id: string;

diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts
@@ -771,6 +771,14 @@ function extractToolCalls(content: unknown): readonly ToolCall[] {
         id: typeof p.id === 'string' ? p.id : undefined,
       });
     }
+    // Pi CLI emits toolCall (camelCase) with arguments (not input)
+    if (p.type === 'toolCall' && typeof p.name === 'string') {
+      toolCalls.push({
+        tool: p.name,
+        input: p.arguments,
+        id: typeof p.id === 'string' ? p.id : undefined,
+      });
+    }
     // Also handle tool_result for output
     if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') {
       // Find matching tool call and add output