From dd628ef7c88976401156b4f41758f7f621d69510 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 22 Mar 2026 07:09:25 +0000 Subject: [PATCH 1/4] feat(skill-trigger): fix multi-provider detection + workspace-scoped skills - Scan full transcript (not just first tool call) so preamble meta-skills like using-superpowers don't cause false negatives for copilot - Add CODEX_MATCHER: detect command_execution reading skill files via bash sed commands; skill name appears in the command string - Fix codex provider: store command_execution input as { command } object instead of raw string so readPathFromInput can extract the path - Add workspace template with .agents/skills/ and .codex/skills/ dirs so evals are self-contained and don't rely on global skill installation - Add 11 new unit tests covering full transcript scanning, codex bash detection, and pi-coding-agent provider resolution E2E results: copilot 4/4, codex 4/4, pi 2/4 (pi has no skill system) Co-Authored-By: Claude Sonnet 4.6 --- .../features/agent-skills-evals/README.md | 25 ++- .../multi-provider-skill-trigger.EVAL.yaml | 14 +- .../.agents/skills/csv-analyzer/SKILL.md | 23 ++ .../.claude/skills/csv-analyzer/SKILL.md | 23 ++ .../.codex/skills/csv-analyzer/SKILL.md | 23 ++ .../agent-skills-evals/workspace/AGENTS.md | 11 + .../agent-skills-evals/workspace/sales.csv | 13 ++ .../evaluation/evaluators/skill-trigger.ts | 50 ++++- .../core/src/evaluation/providers/codex.ts | 2 +- .../evaluators/skill-trigger.test.ts | 212 +++++++++++++++++- .../evaluation/providers/codex-sdk.test.ts | 2 +- 11 files changed, 376 insertions(+), 22 deletions(-) create mode 100644 examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md create mode 100644 examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md create mode 100644 examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md create mode 100644 
examples/features/agent-skills-evals/workspace/AGENTS.md create mode 100644 examples/features/agent-skills-evals/workspace/sales.csv diff --git a/examples/features/agent-skills-evals/README.md b/examples/features/agent-skills-evals/README.md index d77a23471..6ecb2481c 100644 --- a/examples/features/agent-skills-evals/README.md +++ b/examples/features/agent-skills-evals/README.md @@ -82,6 +82,27 @@ agentv eval csv-analyzer.EVAL.yaml agentv eval csv-analyzer.EVAL.yaml --target default ``` +## Skill setup + +The `csv-analyzer` skill is included in this example under `.claude/skills/csv-analyzer/SKILL.md`. Claude Code automatically discovers skills in the project's `.claude/skills/` directory, so no global installation is needed for claude-cli, claude-sdk, or copilot targets. + +## Multi-provider eval + +`multi-provider-skill-trigger.EVAL.yaml` tests the same skill across multiple providers: + +```bash +bun apps/cli/src/cli.ts eval multi-provider-skill-trigger.EVAL.yaml \ + --target copilot --targets ../.agentv/targets.yaml +``` + +The `skill-trigger` evaluator automatically handles each provider's tool-call format: + +| Provider | Detection method | +|----------|-----------------| +| claude, claude-cli | `Skill` tool with `skill` input field | +| copilot | `Using skill: <skill-name>` tool prefix | +| codex | `command_execution` whose command contains the skill name, or `mcp:<server>/<skill-name>` tool prefix | + ## Copilot note When running `skill-trigger` evals against Copilot targets, real traces may show provider-specific tool names such as: @@ -91,6 +112,4 @@ Using skill: Viewing ... ``` -They may also emit `Read` tool calls with the path in `input.path` rather than `input.file_path`. - -These shapes come from real provider traces and should be treated as valid skill-trigger evidence for Copilot targets. +The evaluator scans the entire conversation transcript (not just the first tool call), so a preamble meta-skill like `using-superpowers` firing before `csv-analyzer` still results in a pass. 
diff --git a/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml b/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml index 6d309d07b..377daf57f 100644 --- a/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml +++ b/examples/features/agent-skills-evals/multi-provider-skill-trigger.EVAL.yaml @@ -1,14 +1,22 @@ # Multi-provider skill-trigger evaluation example. # +# Uses an isolated workspace template (workspace/) so no global skill installation +# is needed. The workspace contains: +# .claude/skills/csv-analyzer/ — for claude/copilot providers +# .agents/skills/csv-analyzer/ — for codex provider +# # The same EVAL.yaml works with any provider — just change --target: # -# agentv eval this-file.EVAL.yaml --target claude -# agentv eval this-file.EVAL.yaml --target copilot -# agentv eval this-file.EVAL.yaml --target pi-coding-agent +# agentv eval this-file.EVAL.yaml --target claude --targets ../.agentv/targets.yaml +# agentv eval this-file.EVAL.yaml --target copilot --targets ../.agentv/targets.yaml +# agentv eval this-file.EVAL.yaml --target codex --targets ../.agentv/targets.yaml # # The evaluator automatically resolves the correct tool names for each # provider. No provider-specific config needed in test cases. +workspace: + template: workspace/ + tests: # === Positive cases: skill should trigger === diff --git a/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..e52c484ef --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.agents/skills/csv-analyzer/SKILL.md @@ -0,0 +1,23 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files +--- + +# CSV Analyzer + +## Overview + +Analyzes CSV data to extract insights, summaries, and statistics. 
+ +## How to use + +1. Read the CSV file +2. Parse the columns and rows +3. Identify key metrics (totals, averages, top N entries) +4. Present findings clearly + +## Example tasks + +- "Find the top 5 months by revenue" +- "Which quarter had the most growth" +- "Summarize the sales data" diff --git a/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..e52c484ef --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.claude/skills/csv-analyzer/SKILL.md @@ -0,0 +1,23 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files +--- + +# CSV Analyzer + +## Overview + +Analyzes CSV data to extract insights, summaries, and statistics. + +## How to use + +1. Read the CSV file +2. Parse the columns and rows +3. Identify key metrics (totals, averages, top N entries) +4. Present findings clearly + +## Example tasks + +- "Find the top 5 months by revenue" +- "Which quarter had the most growth" +- "Summarize the sales data" diff --git a/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md b/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md new file mode 100644 index 000000000..e52c484ef --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/.codex/skills/csv-analyzer/SKILL.md @@ -0,0 +1,23 @@ +--- +name: csv-analyzer +description: Use when the user asks to analyze, summarize, or extract insights from CSV data or files +--- + +# CSV Analyzer + +## Overview + +Analyzes CSV data to extract insights, summaries, and statistics. + +## How to use + +1. Read the CSV file +2. Parse the columns and rows +3. Identify key metrics (totals, averages, top N entries) +4. 
Present findings clearly + +## Example tasks + +- "Find the top 5 months by revenue" +- "Which quarter had the most growth" +- "Summarize the sales data" diff --git a/examples/features/agent-skills-evals/workspace/AGENTS.md b/examples/features/agent-skills-evals/workspace/AGENTS.md new file mode 100644 index 000000000..4b57137b8 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/AGENTS.md @@ -0,0 +1,11 @@ +# Agent Instructions + +## Skills + +Domain skills are in `.agents/skills/` relative to your working directory. +Check for a relevant skill before responding to any task. + +Available skills: + +- **csv-analyzer** — use when the user asks to analyze, summarize, or extract + insights from CSV data or files. Skill file: `.agents/skills/csv-analyzer/SKILL.md` diff --git a/examples/features/agent-skills-evals/workspace/sales.csv b/examples/features/agent-skills-evals/workspace/sales.csv new file mode 100644 index 000000000..7f29d45a6 --- /dev/null +++ b/examples/features/agent-skills-evals/workspace/sales.csv @@ -0,0 +1,13 @@ +month,revenue,units_sold +January,12500,150 +February,9800,120 +March,15200,180 +April,11300,140 +May,18900,220 +June,14700,175 +July,16500,195 +August,13200,160 +September,20100,240 +October,17800,210 +November,22500,265 +December,19400,230 diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts index 856a7c7a4..b4302494f 100644 --- a/packages/core/src/evaluation/evaluators/skill-trigger.ts +++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts @@ -58,6 +58,29 @@ const COPILOT_MATCHER: ToolMatcher = { readInputFields: ['file_path', 'path'], }; +/** + * Codex reads skill files via command_execution using a bash sed command containing + * the skill file path. The skill name appears in the command string, so we match + * any command_execution whose command field includes the skill name. + * + * Skill lookup order (workspace-scoped first): + * 1. 
.agents/skills/<skill-name>/SKILL.md (workspace-relative) + * 2. .codex/skills/<skill-name>/SKILL.md (fallback) + * 3. ~/.agents/skills/<skill-name>/SKILL.md (global fallback) + * + * MCP-based skill invocation (`mcp:<server>/<skill-name>`) is also supported for + * Codex configurations that surface skills as MCP tools. + */ +const CODEX_MATCHER: ToolMatcher = { + skillTools: [], + skillInputField: 'skill', + readTools: ['command_execution'], + readInputField: 'command', + skillToolPrefixes: ['mcp:'], + readToolPrefixes: ['mcp:'], + readInputFields: ['command', 'path', 'file_path', 'filePath'], +}; + /** * Static mapping of provider kinds to their tool-name semantics. * Providers not listed here fall back to CLAUDE_MATCHER. @@ -66,6 +89,7 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = { claude: CLAUDE_MATCHER, 'claude-cli': CLAUDE_MATCHER, 'claude-sdk': CLAUDE_MATCHER, + codex: CODEX_MATCHER, 'pi-coding-agent': CLAUDE_MATCHER, 'pi-agent-sdk': CLAUDE_MATCHER, 'copilot-cli': COPILOT_MATCHER, @@ -97,40 +121,44 @@ export class SkillTriggerEvaluator implements Evaluator { const providerKind = context.provider?.kind as ProviderKind | undefined; const matcher = this.resolveMatcher(providerKind); - const firstTool = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? [])[0]; + const allToolCalls = (context.output ?? []).flatMap((msg) => msg.toolCalls ?? []); let triggered = false; let evidence = ''; - if (firstTool) { - const input = (firstTool.input ?? {}) as Record<string, unknown>; + for (const toolCall of allToolCalls) { + const input = (toolCall.input ?? {}) as Record<string, unknown>; - if (matcher.skillTools.includes(firstTool.tool)) { + if (matcher.skillTools.includes(toolCall.tool)) { const skillArg = String(input[matcher.skillInputField] ?? 
''); if (skillArg.includes(skillName)) { triggered = true; evidence = `Skill tool invoked with ${matcher.skillInputField}="${skillArg}"`; + break; } } else if ( matcher.skillToolPrefixes?.some( - (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName), + (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName), ) ) { triggered = true; - evidence = `Skill tool invoked via tool name "${firstTool.tool}"`; - } else if (matcher.readTools.includes(firstTool.tool)) { + evidence = `Skill tool invoked via tool name "${toolCall.tool}"`; + break; + } else if (matcher.readTools.includes(toolCall.tool)) { const filePath = this.readPathFromInput(input, matcher); if (filePath.includes(skillName)) { triggered = true; evidence = `Read tool loaded skill file: ${filePath}`; + break; } } else if ( matcher.readToolPrefixes?.some( - (prefix) => firstTool.tool.startsWith(prefix) && firstTool.tool.includes(skillName), + (prefix) => toolCall.tool.startsWith(prefix) && toolCall.tool.includes(skillName), ) ) { triggered = true; - evidence = `Read tool loaded skill file via tool name "${firstTool.tool}"`; + evidence = `Read tool loaded skill file via tool name "${toolCall.tool}"`; + break; } } @@ -158,8 +186,8 @@ export class SkillTriggerEvaluator implements Evaluator { assertions: [ { text: shouldTrigger - ? firstTool - ? `First tool was "${firstTool.tool}" — not a skill/read tool for "${skillName}"` + ? allToolCalls.length > 0 + ? 
`Skill "${skillName}" not found in ${allToolCalls.length} tool call(s)` : 'No tool calls recorded' : evidence || `Skill "${skillName}" triggered unexpectedly`, passed: false, diff --git a/packages/core/src/evaluation/providers/codex.ts b/packages/core/src/evaluation/providers/codex.ts index bddca49d1..9f2ca0ce7 100644 --- a/packages/core/src/evaluation/providers/codex.ts +++ b/packages/core/src/evaluation/providers/codex.ts @@ -235,7 +235,7 @@ export class CodexProvider implements Provider { if (itemType === 'command_execution') { completedToolCalls.push({ tool: 'command_execution', - input: item.command, + input: { command: item.command }, output: item.aggregated_output, id: item.id, }); diff --git a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts index dacfea669..f9c0f4e04 100644 --- a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts +++ b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts @@ -86,6 +86,64 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('pass'); }); + it('should detect codex mcp skill tool (skill name in tool name)', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'codex', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'mcp:claude-code/csv-analyzer', input: {} }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + expect(result.score).toBe(1); + }); + + it('should detect codex mcp skill tool with arbitrary server name', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'codex', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'mcp:skills/csv-analyzer', input: {} }], + }, + ], + }); + const result = evaluator.evaluate(context); + 
expect(result.verdict).toBe('pass'); + }); + + it('should detect codex bash command_execution reading skill file', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'codex', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { + tool: 'command_execution', + input: { + command: + '/bin/bash -lc "sed -n \'1,220p\' /home/user/.agents/skills/csv-analyzer/SKILL.md"', + }, + }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + expect(result.score).toBe(1); + }); + it('should fail for codex with non-matching tool calls', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ @@ -94,13 +152,13 @@ describe('SkillTriggerEvaluator', () => { { role: 'assistant', content: 'some response', - toolCalls: [{ tool: 'command_execution', input: 'ls -la' }], + toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }], }, ], }); const result = evaluator.evaluate(context); expect(result.verdict).toBe('fail'); - expect(result.assertions.filter((a) => !a.passed)[0].text).toContain('command_execution'); + expect(result.assertions.filter((a) => !a.passed)[0].text).toContain('csv-analyzer'); }); it('should pass for codex with should_trigger: false and unrelated tool', () => { @@ -111,7 +169,7 @@ describe('SkillTriggerEvaluator', () => { { role: 'assistant', content: 'some response', - toolCalls: [{ tool: 'command_execution', input: 'ls -la' }], + toolCalls: [{ tool: 'command_execution', input: { command: 'ls -la' } }], }, ], }); @@ -197,6 +255,154 @@ describe('SkillTriggerEvaluator', () => { }); }); + describe('full transcript scanning', () => { + it('should pass when skill triggers after a preamble meta-skill', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'copilot-cli', targetName: 'test' }, + 
output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { tool: 'Using skill: using-superpowers', input: {} }, + { tool: 'Using skill: csv-analyzer', input: {} }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + }); + + it('should pass when skill triggers in a later message', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + output: [ + { + role: 'assistant', + content: 'thinking...', + toolCalls: [{ tool: 'Bash', input: { command: 'ls' } }], + }, + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + }); + + it('should fail when target skill never appears anywhere in transcript', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { tool: 'Using skill: using-superpowers', input: {} }, + { tool: 'Bash', input: { command: 'ls' } }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('fail'); + }); + + it('should pass for should_trigger:false when skill never appears in transcript', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); + const context = makeContext({ + output: [ + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'Using skill: using-superpowers', input: {} }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + }); + + it('should fail for should_trigger:false when skill appears later in transcript', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); + const context = makeContext({ + output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { tool: 'Bash', input: { 
command: 'ls' } }, + { tool: 'Skill', input: { skill: 'csv-analyzer' } }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('fail'); + }); + }); + + describe('pi-coding-agent tools', () => { + it('should detect Skill tool for pi-coding-agent', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'pi-coding-agent', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + expect(result.score).toBe(1); + }); + + it('should detect Read tool for pi-coding-agent when reading skill file', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'pi-coding-agent', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { + tool: 'Read', + input: { file_path: '/skills/csv-analyzer/SKILL.md' }, + }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + }); + + it('should pass for pi-coding-agent with should_trigger: false and unrelated tool', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig({ should_trigger: false })); + const context = makeContext({ + provider: { kind: 'pi-coding-agent', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: 'some response', + toolCalls: [{ tool: 'bash', input: { command: 'ls' } }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + }); + }); + describe('copilot-specific tools', () => { it('should recognize readFile tool for copilot', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); diff --git a/packages/core/test/evaluation/providers/codex-sdk.test.ts 
b/packages/core/test/evaluation/providers/codex-sdk.test.ts index 1b6b9e6b6..0e76d586e 100644 --- a/packages/core/test/evaluation/providers/codex-sdk.test.ts +++ b/packages/core/test/evaluation/providers/codex-sdk.test.ts @@ -289,7 +289,7 @@ describe('CodexProvider (SDK)', () => { expect(msg?.toolCalls).toBeDefined(); expect(msg?.toolCalls?.length).toBe(1); expect(msg?.toolCalls?.[0]?.tool).toBe('command_execution'); - expect(msg?.toolCalls?.[0]?.input).toBe('ls -la'); + expect(msg?.toolCalls?.[0]?.input).toEqual({ command: 'ls -la' }); expect(msg?.toolCalls?.[0]?.output).toBe('file1.ts\nfile2.ts'); expect(msg?.toolCalls?.[0]?.id).toBe('cmd-1'); }); From 1ec4ce99640ed92ca9d1d0ca91ed888e4ca9edf3 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 22 Mar 2026 07:21:48 +0000 Subject: [PATCH 2/4] fix(pi-coding-agent): extract toolCall format + add PI_CODING_AGENT_MATCHER Pi CLI emits `type: 'toolCall'` with `arguments` (not `tool_use` / `input`), so tool calls were silently dropped. Also add PI_CODING_AGENT_MATCHER using lowercase `read` + `path` field to match Pi's actual tool names. Co-Authored-By: Claude Sonnet 4.6 --- .../evaluation/evaluators/skill-trigger.ts | 18 ++++++- .../evaluation/providers/pi-coding-agent.ts | 8 +++ .../evaluators/skill-trigger.test.ts | 53 +++++++++++++++++-- 3 files changed, 73 insertions(+), 6 deletions(-) diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts index b4302494f..a7fe545ff 100644 --- a/packages/core/src/evaluation/evaluators/skill-trigger.ts +++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts @@ -58,6 +58,22 @@ const COPILOT_MATCHER: ToolMatcher = { readInputFields: ['file_path', 'path'], }; +/** + * Pi CLI reads skill files using the lowercase `read` tool with a `path` argument. + * Skills are auto-discovered from `.agents/skills/` relative to the working directory. + * + * Skill lookup order (workspace-scoped first): + * 1. 
.agents/skills/<skill-name>/SKILL.md (workspace-relative, auto-discovered) + * 2. ~/.agents/skills/<skill-name>/SKILL.md (global fallback) + */ +const PI_CODING_AGENT_MATCHER: ToolMatcher = { + skillTools: [], + skillInputField: 'skill', + readTools: ['read'], + readInputField: 'path', + readInputFields: ['path', 'file_path', 'filePath'], +}; + /** * Codex reads skill files via command_execution using a bash sed command containing * the skill file path. The skill name appears in the command string, so we match @@ -90,7 +106,7 @@ const PROVIDER_TOOL_SEMANTICS: Partial<Record<ProviderKind, ToolMatcher>> = { 'claude-cli': CLAUDE_MATCHER, 'claude-sdk': CLAUDE_MATCHER, codex: CODEX_MATCHER, - 'pi-coding-agent': CLAUDE_MATCHER, + 'pi-coding-agent': PI_CODING_AGENT_MATCHER, 'pi-agent-sdk': CLAUDE_MATCHER, 'copilot-cli': COPILOT_MATCHER, 'copilot-sdk': COPILOT_MATCHER, diff --git a/packages/core/src/evaluation/providers/pi-coding-agent.ts b/packages/core/src/evaluation/providers/pi-coding-agent.ts index 43a07318d..6438af0b6 100644 --- a/packages/core/src/evaluation/providers/pi-coding-agent.ts +++ b/packages/core/src/evaluation/providers/pi-coding-agent.ts @@ -771,6 +771,14 @@ function extractToolCalls(content: unknown): readonly ToolCall[] { id: typeof p.id === 'string' ? p.id : undefined, }); } + // Pi CLI emits toolCall (camelCase) with arguments (not input) + if (p.type === 'toolCall' && typeof p.name === 'string') { + toolCalls.push({ + tool: p.name, + input: p.arguments, + id: typeof p.id === 'string' ? 
p.id : undefined, + }); + } // Also handle tool_result for output if (p.type === 'tool_result' && typeof p.tool_use_id === 'string') { // Find matching tool call and add output diff --git a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts index f9c0f4e04..d8ab2b859 100644 --- a/packages/core/test/evaluation/evaluators/skill-trigger.test.ts +++ b/packages/core/test/evaluation/evaluators/skill-trigger.test.ts @@ -119,6 +119,44 @@ describe('SkillTriggerEvaluator', () => { expect(result.verdict).toBe('pass'); }); + it('should detect pi-coding-agent read tool loading skill file', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'pi-coding-agent', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: '', + toolCalls: [ + { + tool: 'read', + input: { path: '/workspace/.agents/skills/csv-analyzer/SKILL.md' }, + }, + ], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('pass'); + expect(result.score).toBe(1); + }); + + it('should fail for pi-coding-agent with non-matching read call', () => { + const evaluator = new SkillTriggerEvaluator(makeConfig()); + const context = makeContext({ + provider: { kind: 'pi-coding-agent', targetName: 'test' }, + output: [ + { + role: 'assistant', + content: 'some response', + toolCalls: [{ tool: 'read', input: { path: '/workspace/README.md' } }], + }, + ], + }); + const result = evaluator.evaluate(context); + expect(result.verdict).toBe('fail'); + }); + it('should detect codex bash command_execution reading skill file', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ @@ -348,7 +386,7 @@ describe('SkillTriggerEvaluator', () => { }); describe('pi-coding-agent tools', () => { - it('should detect Skill tool for pi-coding-agent', () => { + it('should detect pi-coding-agent read tool loading 
skill from .agents/skills', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ provider: { kind: 'pi-coding-agent', targetName: 'test' }, @@ -356,7 +394,12 @@ describe('SkillTriggerEvaluator', () => { { role: 'assistant', content: '', - toolCalls: [{ tool: 'Skill', input: { skill: 'csv-analyzer' } }], + toolCalls: [ + { + tool: 'read', + input: { path: '.agents/skills/csv-analyzer/SKILL.md' }, + }, + ], }, ], }); @@ -365,7 +408,7 @@ describe('SkillTriggerEvaluator', () => { expect(result.score).toBe(1); }); - it('should detect Read tool for pi-coding-agent when reading skill file', () => { + it('should detect pi-coding-agent read tool loading skill from global path', () => { const evaluator = new SkillTriggerEvaluator(makeConfig()); const context = makeContext({ provider: { kind: 'pi-coding-agent', targetName: 'test' }, @@ -375,8 +418,8 @@ describe('SkillTriggerEvaluator', () => { content: '', toolCalls: [ { - tool: 'Read', - input: { file_path: '/skills/csv-analyzer/SKILL.md' }, + tool: 'read', + input: { path: '/home/user/.agents/skills/csv-analyzer/SKILL.md' }, }, ], }, From 0dc7964199045b5b1b6a7fe844f08442fccde830 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 22 Mar 2026 08:02:39 +0000 Subject: [PATCH 3/4] chore(pi-agent-sdk): remove from example targets, add deprecation notices pi-agent-sdk has no tools (tools: []) so it cannot read files or invoke skills. pi-coding-agent covers all the same use cases. Example targets.yaml files now use openrouter directly for LLM grading. 
Co-Authored-By: Claude Sonnet 4.6 --- .agentv/targets.yaml | 3 +-- .../showcase/offline-grader-benchmark/.agentv/targets.yaml | 6 ++---- packages/core/src/evaluation/evaluators/skill-trigger.ts | 4 +++- packages/core/src/evaluation/providers/index.ts | 1 + packages/core/src/evaluation/providers/pi-agent-sdk.ts | 7 +++++++ 5 files changed, 14 insertions(+), 7 deletions(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 7a14aa08e..13ab5e997 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -3,8 +3,7 @@ targets: - name: default - provider: pi-agent-sdk - pi_provider: openrouter + provider: openrouter model: z-ai/glm-4.7 api_key: ${{ OPENROUTER_API_KEY }} system_prompt: "Answer directly based on the information provided." diff --git a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml index 3011b53d8..34212cabf 100644 --- a/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml +++ b/examples/showcase/offline-grader-benchmark/.agentv/targets.yaml @@ -17,15 +17,13 @@ targets: model: ${{ AZURE_DEPLOYMENT_NAME }} - name: grader_claude_haiku - provider: pi-agent-sdk - pi_provider: openrouter + provider: openrouter api_key: ${{ OPENROUTER_API_KEY }} model: anthropic/claude-haiku-4.5 system_prompt: "Return concise structured grading output only." - name: grader_gemini_flash - provider: pi-agent-sdk - pi_provider: openrouter + provider: openrouter api_key: ${{ OPENROUTER_API_KEY }} model: google/gemini-3-flash-preview system_prompt: "Return concise structured grading output only." 
diff --git a/packages/core/src/evaluation/evaluators/skill-trigger.ts b/packages/core/src/evaluation/evaluators/skill-trigger.ts index a7fe545ff..56e5246e5 100644 --- a/packages/core/src/evaluation/evaluators/skill-trigger.ts +++ b/packages/core/src/evaluation/evaluators/skill-trigger.ts @@ -107,7 +107,9 @@ const PROVIDER_TOOL_SEMANTICS: Partial> = { 'claude-sdk': CLAUDE_MATCHER, codex: CODEX_MATCHER, 'pi-coding-agent': PI_CODING_AGENT_MATCHER, - 'pi-agent-sdk': CLAUDE_MATCHER, + // pi-agent-sdk has no tools, so skill detection is a no-op. Kept for completeness. + // TODO: consider removing pi-agent-sdk provider entirely. + 'pi-agent-sdk': PI_CODING_AGENT_MATCHER, 'copilot-cli': COPILOT_MATCHER, 'copilot-sdk': COPILOT_MATCHER, vscode: COPILOT_MATCHER, diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 54445b5f2..7e69333df 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -100,6 +100,7 @@ export function createBuiltinProviderRegistry(): ProviderRegistry { .register('copilot-sdk', (t) => new CopilotSdkProvider(t.name, t.config as never)) .register('copilot-cli', (t) => new CopilotCliProvider(t.name, t.config as never)) .register('pi-coding-agent', (t) => new PiCodingAgentProvider(t.name, t.config as never)) + // TODO: consider removing pi-agent-sdk — it has no tools and is superseded by pi-coding-agent .register('pi-agent-sdk', (t) => new PiAgentSdkProvider(t.name, t.config as never)) // claude-cli is the new default subprocess provider; claude is an alias .register('claude-cli', (t) => new ClaudeCliProvider(t.name, t.config as never)) diff --git a/packages/core/src/evaluation/providers/pi-agent-sdk.ts b/packages/core/src/evaluation/providers/pi-agent-sdk.ts index 38a619402..b36384908 100644 --- a/packages/core/src/evaluation/providers/pi-agent-sdk.ts +++ b/packages/core/src/evaluation/providers/pi-agent-sdk.ts @@ -66,6 +66,13 @@ 
interface ToolExecTracker { * * Note: Dependencies are loaded lazily on first use to avoid bundling issues. * Users must install @mariozechner/pi-agent-core and @mariozechner/pi-ai separately. + * + * @deprecated Consider removing this provider. It initializes with tools: [] so it + * cannot read files or invoke skills — making it a plain Q&A loop with no agentic + * capability. The `pi-coding-agent` provider covers the same use cases and more + * (full tool access, workspace isolation, skill discovery). For lightweight LLM + * grading without a CLI dependency, use the `openrouter`, `openai`, or `gemini` + * providers instead. */ export class PiAgentSdkProvider implements Provider { readonly id: string; From 0a3a31366be76dfb679f5dae136fccaf8453658a Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sun, 22 Mar 2026 08:10:14 +0000 Subject: [PATCH 4/4] fix(targets): restore default target to pi-coding-agent The self-evaluation default target should use the coding agent, not a direct LLM call. The offline-grader-benchmark targets remain as openrouter since graders only need plain LLM calls. Co-Authored-By: Claude Sonnet 4.6 --- .agentv/targets.yaml | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/.agentv/targets.yaml b/.agentv/targets.yaml index 13ab5e997..09b9fc8fe 100644 --- a/.agentv/targets.yaml +++ b/.agentv/targets.yaml @@ -3,7 +3,9 @@ targets: - name: default - provider: openrouter + provider: pi-coding-agent + executable: ${{ PI_CLI_PATH }} + pi_provider: openrouter model: z-ai/glm-4.7 api_key: ${{ OPENROUTER_API_KEY }} system_prompt: "Answer directly based on the information provided."