From be79a8c78f78282a37eb0f192c108622d7a617d7 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 10:38:21 +1100 Subject: [PATCH 1/9] chore(deps): start ai sdk 6 migration From 841fbe1d07c16394a5b82fc9dc7c0e85e2276ac1 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 00:24:35 +0000 Subject: [PATCH 2/9] chore(deps): bump ai sdk from v5 to v6 Co-Authored-By: Claude Opus 4.6 --- apps/cli/package.json | 2 +- bun.lock | 32 ++++++++++++++++---------------- packages/core/package.json | 10 +++++----- 3 files changed, 22 insertions(+), 22 deletions(-) diff --git a/apps/cli/package.json b/apps/cli/package.json index c0eb6dcac..d99932954 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -28,7 +28,7 @@ "test:watch": "bun test --watch" }, "dependencies": { - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@inquirer/prompts": "^8.2.1", diff --git a/bun.lock b/bun.lock index 101d988bc..12600c368 100644 --- a/bun.lock +++ b/bun.lock @@ -29,7 +29,7 @@ "agentv": "./dist/cli.js", }, "dependencies": { - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@inquirer/prompts": "^8.2.1", @@ -66,17 +66,17 @@ "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", - "@ai-sdk/anthropic": "^2.0.53", - "@ai-sdk/azure": "^2.0.78", - "@ai-sdk/google": "^2.0.44", - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.0", + "@ai-sdk/azure": "^3.0.0", + "@ai-sdk/google": "^3.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", "@mariozechner/pi-ai": "^0.54.2", "@openai/codex-sdk": "^0.104.0", "@openrouter/ai-sdk-provider": "^2.3.1", - "ai": "^5.0.106", + "ai": "^6.0.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", @@ -113,19 +113,19 @@ "@agentv/web": ["@agentv/web@workspace:apps/web"], - "@ai-sdk/anthropic": ["@ai-sdk/anthropic@2.0.56", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-XHJKu0Yvfu9SPzRfsAFESa+9T7f2YJY6TxykKMfRsAwpeWAiX/Gbx5J5uM15AzYC3Rw8tVP3oH+j7jEivENirQ=="], + "@ai-sdk/anthropic": ["@ai-sdk/anthropic@3.0.58", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-/53SACgmVukO4bkms4dpxpRlYhW8Ct6QZRe6sj1Pi5H00hYhxIrqfiLbZBGxkdRvjsBQeP/4TVGsXgH5rQeb8Q=="], - "@ai-sdk/azure": ["@ai-sdk/azure@2.0.87", "", { "dependencies": { "@ai-sdk/openai": "2.0.85", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-raGHMKOqsUIWtWyC1IRxgB+D/MrGldNh5l6HUyHblKKA9yXrIr4RThpLUPhfpt58vvSxgM4yXaOyiFE6AtIDTQ=="], + "@ai-sdk/azure": ["@ai-sdk/azure@3.0.42", "", { "dependencies": { "@ai-sdk/openai": "3.0.41", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BGg0e3GEI7KHkwUv7d5f9rXzDlTiWhQ4xzVakdHLV/OP24jvXes5X7fI3QZ0rbKBop6URq0yaxomBfwEqqRlzw=="], - "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.21", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@vercel/oidc": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BwV7DU/lAm3Xn6iyyvZdWgVxgLu3SNXzl5y57gMvkW4nGhAOV5269IrJzQwGt03bb107sa6H6uJwWxc77zXoGA=="], + "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.66", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-SIQ0YY0iMuv+07HLsZ+bB990zUJ6S4ujORAh+Jv1V2KGNn73qQKnGO0JBk+w+Res8YqOFSycwDoWcFlQrVxS4A=="], - "@ai-sdk/google": ["@ai-sdk/google@2.0.46", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8PK6u4sGE/kXebd7ZkTp+0aya4kNqzoqpS5m7cHY2NfTK6fhPc6GNvE+MZIZIoHQTp5ed86wGBdeBPpFaaUtyg=="], + "@ai-sdk/google": ["@ai-sdk/google@3.0.43", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-NGCgP5g8HBxrNdxvF8Dhww+UKfqAkZAmyYBvbu9YLoBkzAmGKDBGhVptN/oXPB5Vm0jggMdoLycZ8JReQM8Zqg=="], - "@ai-sdk/openai": ["@ai-sdk/openai@2.0.85", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3pzr7qVhsOXwjPAfmvFNZz3sRWCuyMOc3GgLHe7sWY0t8J4hA5mwQ4LISTKYI3iIr8IXzAQn9MUrC8Hiji9RpA=="], + "@ai-sdk/openai": ["@ai-sdk/openai@3.0.41", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-IZ42A+FO+vuEQCVNqlnAPYQnnUpUfdJIwn1BEDOBywiEHa23fw7PahxVtlX9zm3/zMvTW4JKPzWyvAgDu+SQ2A=="], - "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], + "@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], - "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.19", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-W41Wc9/jbUVXVwCN/7bWa4IKe8MtxO3EyA0Hfhx6grnmiYlCvpI8neSYWFE0zScXJkgA/YK3BRybzgyiXuu6JA=="], + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.19", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3eG55CrSWCu2SXlqq2QCsFjo3+E7+Gmg7i/oRVoSZzIodTuDSfLb3MRje67xE9RFea73Zao7Lm4mADIfUETKGg=="], "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.49", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-3avi409dwuGkPEETpWa0gyJvRMr3b6LxeuW5/sAPCOtLD9WxH9fYltbA5wZoazxTw5mlbXmjDp7JqO1rlmpaIQ=="], @@ -709,7 +709,7 @@ "@speed-highlight/core": ["@speed-highlight/core@1.2.14", "", {}, "sha512-G4ewlBNhUtlLvrJTb88d2mdy2KRijzs4UhnlrOSRT4bmjh/IqNElZa3zkrZ+TC47TwtlDWzVLFADljF1Ijp5hA=="], - "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="], + "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="], "@tootallnate/quickjs-emscripten": ["@tootallnate/quickjs-emscripten@0.23.0", "", {}, "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="], @@ -749,7 +749,7 @@ "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], - "@vercel/oidc": ["@vercel/oidc@3.0.5", "", {}, "sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw=="], + "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="], "acorn": ["acorn@8.15.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg=="], @@ -761,7 +761,7 @@ "agentv": ["agentv@workspace:apps/cli"], - "ai": ["ai@5.0.112", "", { "dependencies": { "@ai-sdk/gateway": "2.0.21", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-Y0dluYpe5wn81UkfHbZL78mH6CsceUfMiu4oPRaWZvjlmcoXSPdEAsPcYbOjvX8ZPvQc6m4kNZhkcEXmT2ln4w=="], + "ai": ["ai@6.0.116", "", { "dependencies": { "@ai-sdk/gateway": "3.0.66", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-7yM+cTmyRLeNIXwt4Vj+mrrJgVQ9RMIW5WO0ydoLoYkewIvsMcvUmqS4j2RJTUXaF1HphwmSKUMQ/HypNRGOmA=="], "ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="], diff --git a/packages/core/package.json b/packages/core/package.json index a8476ffb1..79702d601 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -41,17 +41,17 @@ "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", - "@ai-sdk/anthropic": "^2.0.53", - "@ai-sdk/azure": "^2.0.78", - "@ai-sdk/google": "^2.0.44", - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.0", + "@ai-sdk/azure": "^3.0.0", + "@ai-sdk/google": "^3.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", "@mariozechner/pi-ai": "^0.54.2", "@openai/codex-sdk": "^0.104.0", "@openrouter/ai-sdk-provider": "^2.3.1", - "ai": "^5.0.106", + "ai": "^6.0.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", From 24a8b0f007d0bf7303d12c615588e9031b3c776d Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 00:25:56 +0000 Subject: [PATCH 3/9] refactor(providers): remove v2/v3 compatibility casts Co-Authored-By: Claude Opus 4.6 --- packages/core/src/evaluation/providers/agentv-provider.ts | 4 +--- packages/core/src/evaluation/providers/ai-sdk.ts | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts index 1ba115452..d4c66db3d 100644 --- a/packages/core/src/evaluation/providers/agentv-provider.ts +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -31,9 +31,7 @@ function createLanguageModel(modelString: string): LanguageModel { switch (provider) { case 'openai': - // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the - // codebase uses LanguageModelV2. The runtime API is compatible. - return createOpenAI()(modelName) as unknown as LanguageModel; + return createOpenAI()(modelName); case 'anthropic': return createAnthropic()(modelName); case 'azure': diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index 1af7b63bb..f596fcb97 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -133,9 +133,7 @@ export class OpenRouterProvider implements Provider { const openrouter = createOpenRouter({ apiKey: config.apiKey, }); - // Cast: OpenRouter may return LanguageModelV3 while the rest of the - // codebase uses LanguageModelV2. The runtime API is compatible. - this.model = openrouter(config.model) as unknown as LanguageModel; + this.model = openrouter(config.model); } async invoke(request: ProviderRequest): Promise { From 43f139fcb364dd19302d411a4a64766dcb37b3ba Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 00:27:09 +0000 Subject: [PATCH 4/9] test(providers): update mocks to specificationVersion v3 --- .../test/evaluation/providers/agentv-provider.test.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts index 78559acca..72ce58ce7 100644 --- a/packages/core/test/evaluation/providers/agentv-provider.test.ts +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -5,7 +5,7 @@ import { describe, expect, it, vi } from 'vitest'; vi.mock('@ai-sdk/openai', () => ({ createOpenAI: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'openai', }), })); @@ -13,7 +13,7 @@ vi.mock('@ai-sdk/openai', () => ({ vi.mock('@ai-sdk/anthropic', () => ({ createAnthropic: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'anthropic', }), })); @@ -21,7 +21,7 @@ vi.mock('@ai-sdk/anthropic', () => ({ vi.mock('@ai-sdk/azure', () => ({ createAzure: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'azure', }), })); @@ -29,7 +29,7 @@ vi.mock('@ai-sdk/azure', () => ({ vi.mock('@ai-sdk/google', () => ({ createGoogleGenerativeAI: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'google', }), })); From 5bfe47fd95d77d1c08ecc47d59dbbd5b97a773a6 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 00:38:53 +0000 Subject: [PATCH 5/9] feat(core): add reasoning tokens and candidate duration metrics - Add `reasoning` field to TokenUsage and ProviderTokenUsage interfaces - Extract reasoning_tokens from Claude CLI provider's usage response - Extract reasoningTokens from AI SDK provider's usage response - Add `candidateDurationMs` to EvaluationResult (agent-only time, excludes grading) - Override `durationMs` with total case time (includes grading) in orchestrator - Update TimingArtifact to include reasoning token accumulation - Fix artifact-writer tests for new reasoning field Closes #633 --- apps/cli/src/commands/eval/artifact-writer.ts | 8 +++++++- .../test/commands/eval/artifact-writer.test.ts | 6 +++--- packages/core/src/evaluation/orchestrator.ts | 18 +++++++++++++++++- .../core/src/evaluation/providers/ai-sdk.ts | 9 ++++++++- .../src/evaluation/providers/claude-cli.ts | 2 ++ .../core/src/evaluation/providers/types.ts | 2 ++ packages/core/src/evaluation/trace.ts | 2 ++ packages/core/src/evaluation/types.ts | 4 +++- 8 files changed, 44 insertions(+), 7 deletions(-) diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 275484c01..893ec4861 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -49,6 +49,7 @@ export interface TimingArtifact { readonly token_usage: { readonly input: number; readonly output: number; + readonly reasoning: number; }; } @@ -273,13 +274,17 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact { let totalInput = 0; let totalOutput = 0; + let totalReasoning = 0; let totalDurationMs = 0; for (const result of results) { - const usage = result.tokenUsage as { input?: number; output?: number } | undefined; + const usage = result.tokenUsage as + | { input?: number; output?: number; reasoning?: number } + | undefined; if (usage) { totalInput += usage.input ?? 0; totalOutput += usage.output ?? 0; + totalReasoning += usage.reasoning ?? 0; } if (result.durationMs != null) { totalDurationMs += result.durationMs; @@ -293,6 +298,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin token_usage: { input: totalInput, output: totalOutput, + reasoning: totalReasoning, }, }; } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 07229e9de..2edd42c2a 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -202,7 +202,7 @@ describe('buildTimingArtifact', () => { expect(timing.total_tokens).toBe(4500); expect(timing.duration_ms).toBe(90000); expect(timing.total_duration_seconds).toBe(90); - expect(timing.token_usage).toEqual({ input: 3000, output: 1500 }); + expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 }); }); it('handles results with no timing data', () => { @@ -212,7 +212,7 @@ describe('buildTimingArtifact', () => { expect(timing.total_tokens).toBe(0); expect(timing.duration_ms).toBe(0); expect(timing.total_duration_seconds).toBe(0); - expect(timing.token_usage).toEqual({ input: 0, output: 0 }); + expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 }); }); it('handles empty results array', () => { @@ -232,7 +232,7 @@ describe('buildTimingArtifact', () => { const timing = buildTimingArtifact(results); expect(timing.total_tokens).toBe(500); - expect(timing.token_usage).toEqual({ input: 500, output: 0 }); + expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 }); }); }); diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ce8b2983a..c5711de95 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1527,6 +1527,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { const { evalCase, @@ -1935,6 +1949,7 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, + candidateDurationMs, } = options; const gradeTimestamp = nowFn(); @@ -2011,6 +2026,7 @@ async function evaluateCandidate(options: { tokenUsage, costUsd, durationMs, + candidateDurationMs, startTime, endTime, requests, diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index f596fcb97..1c3e4acd4 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -390,9 +390,16 @@ async function invokeModel(options: { function mapResponse(result: TextResult): ProviderResponse { const content = result.text ?? ''; const rawUsage = result.totalUsage ?? result.usage; + const reasoning = (rawUsage as Record | undefined)?.reasoningTokens as + | number + | undefined; const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null - ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } + ? { + input: rawUsage.inputTokens, + output: rawUsage.outputTokens, + reasoning: reasoning ?? undefined, + } : undefined; return { diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 659d82097..27fa2e200 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -119,10 +119,12 @@ export class ClaudeCliProvider implements Provider { ((usage.cache_read_input_tokens as number) ?? 0) + ((usage.cache_creation_input_tokens as number) ?? 0); const outputTokens = (usage.output_tokens as number) ?? 0; + const reasoningTokens = (usage.reasoning_tokens as number) ?? undefined; tokenUsage = { input: inputTokens, output: outputTokens, cached: (usage.cache_read_input_tokens as number) ?? undefined, + reasoning: reasoningTokens, }; // Stream callback for LLM usage diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 904751cf4..f482b711f 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -193,6 +193,8 @@ export interface ProviderTokenUsage { readonly output: number; /** Cached tokens (optional, provider-specific) */ readonly cached?: number; + /** Reasoning/thinking tokens (optional, provider-specific) */ + readonly reasoning?: number; } export interface ProviderResponse { diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 95f23de6e..a4d486d58 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -13,6 +13,8 @@ export interface TokenUsage { readonly output: number; /** Cached tokens (optional, provider-specific) */ readonly cached?: number; + /** Reasoning/thinking tokens (optional, provider-specific) */ + readonly reasoning?: number; } /** diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 0a4a32cfa..3407ae20d 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -909,8 +909,10 @@ export interface EvaluationResult { readonly tokenUsage?: TokenUsage; /** Total cost in USD (optional, from provider) */ readonly costUsd?: number; - /** Total execution duration in milliseconds (optional) */ + /** Total execution duration in milliseconds (includes grading time) */ readonly durationMs?: number; + /** Execution duration of just the candidate/agent in milliseconds (excludes grading time) */ + readonly candidateDurationMs?: number; /** ISO 8601 timestamp when execution started */ readonly startTime?: string; /** ISO 8601 timestamp when execution ended */ From 2fc4bf72802fa62cdfc1f877d4b9fffb78be8999 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 01:06:48 +0000 Subject: [PATCH 6/9] fix: replace candidateDurationMs with evalRun aggregate metrics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Revert candidateDurationMs and durationMs override — durationMs stays as the candidate-only duration from the provider - Add evalRun field to EvaluationResult with total durationMs (candidate + grading) and aggregated tokenUsage (candidate + all evaluators) - Add aggregateEvaluatorTokenUsage helper that recursively sums token usage from evaluator results including nested children Addresses review feedback on #633: durationMs is already candidate-only, so keep it as-is and add a separate total eval run field instead. --- packages/core/src/evaluation/orchestrator.ts | 86 ++++++++++++++++---- packages/core/src/evaluation/types.ts | 9 +- 2 files changed, 78 insertions(+), 17 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index c5711de95..2be063ec9 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1689,7 +1689,6 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise { const { evalCase, @@ -1949,7 +1960,6 @@ async function evaluateCandidate(options: { availableTargets, fileChanges, workspacePath, - candidateDurationMs, } = options; const gradeTimestamp = nowFn(); @@ -2026,7 +2036,6 @@ async function evaluateCandidate(options: { tokenUsage, costUsd, durationMs, - candidateDurationMs, startTime, endTime, requests, @@ -2581,6 +2590,55 @@ function buildResultInput(promptInputs: PromptInputs): EvaluationResult['input'] return promptInputs.question; } +/** + * Sum token usage across all evaluator results (including nested children). + * Returns undefined when no evaluator reported token usage. + */ +function aggregateEvaluatorTokenUsage( + scores?: readonly EvaluatorResult[], +): TokenUsage | undefined { + if (!scores || scores.length === 0) return undefined; + + let hasAny = false; + let input = 0; + let output = 0; + let reasoning = 0; + let cached = 0; + let hasReasoning = false; + let hasCached = false; + + const visit = (items: readonly EvaluatorResult[]): void => { + for (const item of items) { + if (item.tokenUsage) { + hasAny = true; + input += item.tokenUsage.input; + output += item.tokenUsage.output; + if (item.tokenUsage.reasoning != null) { + hasReasoning = true; + reasoning += item.tokenUsage.reasoning; + } + if (item.tokenUsage.cached != null) { + hasCached = true; + cached += item.tokenUsage.cached; + } + } + if (item.scores) { + visit(item.scores); + } + } + }; + + visit(scores); + if (!hasAny) return undefined; + + return { + input, + output, + ...(hasReasoning ? { reasoning } : {}), + ...(hasCached ? { cached } : {}), + }; +} + function isTimeoutLike(error: unknown): boolean { if (!error) { return false; diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 3407ae20d..970903d5c 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -909,10 +909,8 @@ export interface EvaluationResult { readonly tokenUsage?: TokenUsage; /** Total cost in USD (optional, from provider) */ readonly costUsd?: number; - /** Total execution duration in milliseconds (includes grading time) */ + /** Candidate/agent execution duration in milliseconds (excludes grading time) */ readonly durationMs?: number; - /** Execution duration of just the candidate/agent in milliseconds (excludes grading time) */ - readonly candidateDurationMs?: number; /** ISO 8601 timestamp when execution started */ readonly startTime?: string; /** ISO 8601 timestamp when execution ended */ @@ -950,6 +948,11 @@ export interface EvaluationResult { readonly costLimited?: boolean; /** Whether the evaluation was skipped due to suite-level budget exhaustion */ readonly budgetExceeded?: boolean; + /** Aggregate metrics for the full eval run (candidate + grading) */ + readonly evalRun?: { + readonly durationMs?: number; + readonly tokenUsage?: TokenUsage; + }; /** Primary classification: ok, quality_failure, or execution_error */ readonly executionStatus: ExecutionStatus; /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */ From 6f0886caf869ae912de3aa954b58a01fea1ae622 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 01:07:58 +0000 Subject: [PATCH 7/9] style: fix biome formatting for aggregateEvaluatorTokenUsage --- packages/core/src/evaluation/orchestrator.ts | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 2be063ec9..fec65e67d 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -2594,9 +2594,7 @@ function buildResultInput(promptInputs: PromptInputs): EvaluationResult['input'] * Sum token usage across all evaluator results (including nested children). * Returns undefined when no evaluator reported token usage. */ -function aggregateEvaluatorTokenUsage( - scores?: readonly EvaluatorResult[], -): TokenUsage | undefined { +function aggregateEvaluatorTokenUsage(scores?: readonly EvaluatorResult[]): TokenUsage | undefined { if (!scores || scores.length === 0) return undefined; let hasAny = false; From 18d99f908a025ea1cdd87127932d73f960add52c Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 01:44:41 +0000 Subject: [PATCH 8/9] fix(providers): extract reasoning and cached tokens from AI SDK properly Use the standard outputTokenDetails.reasoningTokens and inputTokenDetails.cacheReadTokens paths instead of the deprecated top-level reasoningTokens field. This correctly extracts reasoning tokens from OpenAI, OpenRouter, Azure, Anthropic, and Gemini providers that go through the Vercel AI SDK. --- packages/core/src/evaluation/providers/ai-sdk.ts | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index 1c3e4acd4..df561679e 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -390,15 +390,15 @@ async function invokeModel(options: { function mapResponse(result: TextResult): ProviderResponse { const content = result.text ?? ''; const rawUsage = result.totalUsage ?? result.usage; - const reasoning = (rawUsage as Record | undefined)?.reasoningTokens as - | number - | undefined; + const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? undefined; + const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? undefined; const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens, - reasoning: reasoning ?? undefined, + ...(reasoning != null ? { reasoning } : {}), + ...(cached != null ? { cached } : {}), } : undefined; From 2c92efd58b3853daf51110539c49da44a120a559 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Tue, 17 Mar 2026 02:10:39 +0000 Subject: [PATCH 9/9] fix: use nullish checks for evalRun token aggregation, add evalRun on error paths - Replace || with != null for reasoning/cached checks in evalRun aggregation to correctly include fields when value is 0 - Add evalRun.durationMs to evaluator error catch path so consumers always get timing even when grading fails --- packages/core/src/evaluation/orchestrator.ts | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index fec65e67d..99136efca 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1723,10 +1723,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {}); } else { - return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput }; + return { ...errorResult, evalRun, workspacePath, beforeEachOutput, afterEachOutput }; } } - return { ...errorResult, beforeEachOutput, afterEachOutput }; + return { ...errorResult, evalRun, beforeEachOutput, afterEachOutput }; } }