diff --git a/apps/cli/package.json b/apps/cli/package.json index c0eb6dcac..d99932954 100644 --- a/apps/cli/package.json +++ b/apps/cli/package.json @@ -28,7 +28,7 @@ "test:watch": "bun test --watch" }, "dependencies": { - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@inquirer/prompts": "^8.2.1", diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts index 275484c01..893ec4861 100644 --- a/apps/cli/src/commands/eval/artifact-writer.ts +++ b/apps/cli/src/commands/eval/artifact-writer.ts @@ -49,6 +49,7 @@ export interface TimingArtifact { readonly token_usage: { readonly input: number; readonly output: number; + readonly reasoning: number; }; } @@ -273,13 +274,17 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact { let totalInput = 0; let totalOutput = 0; + let totalReasoning = 0; let totalDurationMs = 0; for (const result of results) { - const usage = result.tokenUsage as { input?: number; output?: number } | undefined; + const usage = result.tokenUsage as + | { input?: number; output?: number; reasoning?: number } + | undefined; if (usage) { totalInput += usage.input ?? 0; totalOutput += usage.output ?? 0; + totalReasoning += usage.reasoning ?? 0; } if (result.durationMs != null) { totalDurationMs += result.durationMs; @@ -293,6 +298,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin token_usage: { input: totalInput, output: totalOutput, + reasoning: totalReasoning, }, }; } diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts index 07229e9de..2edd42c2a 100644 --- a/apps/cli/test/commands/eval/artifact-writer.test.ts +++ b/apps/cli/test/commands/eval/artifact-writer.test.ts @@ -202,7 +202,7 @@ describe('buildTimingArtifact', () => { expect(timing.total_tokens).toBe(4500); expect(timing.duration_ms).toBe(90000); expect(timing.total_duration_seconds).toBe(90); - expect(timing.token_usage).toEqual({ input: 3000, output: 1500 }); + expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 }); }); it('handles results with no timing data', () => { @@ -212,7 +212,7 @@ describe('buildTimingArtifact', () => { expect(timing.total_tokens).toBe(0); expect(timing.duration_ms).toBe(0); expect(timing.total_duration_seconds).toBe(0); - expect(timing.token_usage).toEqual({ input: 0, output: 0 }); + expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 }); }); it('handles empty results array', () => { @@ -232,7 +232,7 @@ describe('buildTimingArtifact', () => { const timing = buildTimingArtifact(results); expect(timing.total_tokens).toBe(500); - expect(timing.token_usage).toEqual({ input: 500, output: 0 }); + expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 }); }); }); diff --git a/bun.lock b/bun.lock index 101d988bc..12600c368 100644 --- a/bun.lock +++ b/bun.lock @@ -29,7 +29,7 @@ "agentv": "./dist/cli.js", }, "dependencies": { - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@inquirer/prompts": "^8.2.1", @@ -66,17 +66,17 @@ "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", - "@ai-sdk/anthropic": "^2.0.53", - "@ai-sdk/azure": "^2.0.78", - "@ai-sdk/google": "^2.0.44", - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.0", + "@ai-sdk/azure": "^3.0.0", + "@ai-sdk/google": "^3.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", "@mariozechner/pi-ai": "^0.54.2", "@openai/codex-sdk": "^0.104.0", "@openrouter/ai-sdk-provider": "^2.3.1", - "ai": "^5.0.106", + "ai": "^6.0.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", @@ -113,19 +113,19 @@ "@agentv/web": ["@agentv/web@workspace:apps/web"], - "@ai-sdk/anthropic": ["@ai-sdk/anthropic@2.0.56", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-XHJKu0Yvfu9SPzRfsAFESa+9T7f2YJY6TxykKMfRsAwpeWAiX/Gbx5J5uM15AzYC3Rw8tVP3oH+j7jEivENirQ=="], + "@ai-sdk/anthropic": ["@ai-sdk/anthropic@3.0.58", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-/53SACgmVukO4bkms4dpxpRlYhW8Ct6QZRe6sj1Pi5H00hYhxIrqfiLbZBGxkdRvjsBQeP/4TVGsXgH5rQeb8Q=="], - "@ai-sdk/azure": ["@ai-sdk/azure@2.0.87", "", { "dependencies": { "@ai-sdk/openai": "2.0.85", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-raGHMKOqsUIWtWyC1IRxgB+D/MrGldNh5l6HUyHblKKA9yXrIr4RThpLUPhfpt58vvSxgM4yXaOyiFE6AtIDTQ=="], + "@ai-sdk/azure": ["@ai-sdk/azure@3.0.42", "", { "dependencies": { "@ai-sdk/openai": "3.0.41", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BGg0e3GEI7KHkwUv7d5f9rXzDlTiWhQ4xzVakdHLV/OP24jvXes5X7fI3QZ0rbKBop6URq0yaxomBfwEqqRlzw=="], - "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.21", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@vercel/oidc": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BwV7DU/lAm3Xn6iyyvZdWgVxgLu3SNXzl5y57gMvkW4nGhAOV5269IrJzQwGt03bb107sa6H6uJwWxc77zXoGA=="], + "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.66", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-SIQ0YY0iMuv+07HLsZ+bB990zUJ6S4ujORAh+Jv1V2KGNn73qQKnGO0JBk+w+Res8YqOFSycwDoWcFlQrVxS4A=="], - "@ai-sdk/google": ["@ai-sdk/google@2.0.46", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8PK6u4sGE/kXebd7ZkTp+0aya4kNqzoqpS5m7cHY2NfTK6fhPc6GNvE+MZIZIoHQTp5ed86wGBdeBPpFaaUtyg=="], + "@ai-sdk/google": ["@ai-sdk/google@3.0.43", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-NGCgP5g8HBxrNdxvF8Dhww+UKfqAkZAmyYBvbu9YLoBkzAmGKDBGhVptN/oXPB5Vm0jggMdoLycZ8JReQM8Zqg=="], - "@ai-sdk/openai": ["@ai-sdk/openai@2.0.85", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3pzr7qVhsOXwjPAfmvFNZz3sRWCuyMOc3GgLHe7sWY0t8J4hA5mwQ4LISTKYI3iIr8IXzAQn9MUrC8Hiji9RpA=="], + "@ai-sdk/openai": ["@ai-sdk/openai@3.0.41", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-IZ42A+FO+vuEQCVNqlnAPYQnnUpUfdJIwn1BEDOBywiEHa23fw7PahxVtlX9zm3/zMvTW4JKPzWyvAgDu+SQ2A=="], - "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="], + "@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="], - "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.19", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-W41Wc9/jbUVXVwCN/7bWa4IKe8MtxO3EyA0Hfhx6grnmiYlCvpI8neSYWFE0zScXJkgA/YK3BRybzgyiXuu6JA=="], + "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.19", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3eG55CrSWCu2SXlqq2QCsFjo3+E7+Gmg7i/oRVoSZzIodTuDSfLb3MRje67xE9RFea73Zao7Lm4mADIfUETKGg=="], "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.49", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-3avi409dwuGkPEETpWa0gyJvRMr3b6LxeuW5/sAPCOtLD9WxH9fYltbA5wZoazxTw5mlbXmjDp7JqO1rlmpaIQ=="], @@ -709,7 +709,7 @@ "@speed-highlight/core": ["@speed-highlight/core@1.2.14", "", {}, "sha512-G4ewlBNhUtlLvrJTb88d2mdy2KRijzs4UhnlrOSRT4bmjh/IqNElZa3zkrZ+TC47TwtlDWzVLFADljF1Ijp5hA=="], - "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="], + "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="], "@tootallnate/quickjs-emscripten": ["@tootallnate/quickjs-emscripten@0.23.0", "", {}, "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="], @@ -749,7 +749,7 @@ "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="], - "@vercel/oidc": ["@vercel/oidc@3.0.5", "", {}, "sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw=="], + "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="], "acorn": ["acorn@8.15.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg=="], @@ -761,7 +761,7 @@ "agentv": ["agentv@workspace:apps/cli"], - "ai": ["ai@5.0.112", "", { "dependencies": { "@ai-sdk/gateway": "2.0.21", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-Y0dluYpe5wn81UkfHbZL78mH6CsceUfMiu4oPRaWZvjlmcoXSPdEAsPcYbOjvX8ZPvQc6m4kNZhkcEXmT2ln4w=="], + "ai": ["ai@6.0.116", "", { "dependencies": { "@ai-sdk/gateway": "3.0.66", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-7yM+cTmyRLeNIXwt4Vj+mrrJgVQ9RMIW5WO0ydoLoYkewIvsMcvUmqS4j2RJTUXaF1HphwmSKUMQ/HypNRGOmA=="], "ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="], diff --git a/packages/core/package.json b/packages/core/package.json index a8476ffb1..79702d601 100644 --- a/packages/core/package.json +++ b/packages/core/package.json @@ -41,17 +41,17 @@ "dependencies": { "@agentclientprotocol/sdk": "^0.14.1", "@agentv/eval": "workspace:*", - "@ai-sdk/anthropic": "^2.0.53", - "@ai-sdk/azure": "^2.0.78", - "@ai-sdk/google": "^2.0.44", - "@ai-sdk/openai": "^2.0.0", + "@ai-sdk/anthropic": "^3.0.0", + "@ai-sdk/azure": "^3.0.0", + "@ai-sdk/google": "^3.0.0", + "@ai-sdk/openai": "^3.0.0", "@anthropic-ai/claude-agent-sdk": "^0.2.49", "@github/copilot-sdk": "^0.1.25", "@mariozechner/pi-agent-core": "^0.54.2", "@mariozechner/pi-ai": "^0.54.2", "@openai/codex-sdk": "^0.104.0", "@openrouter/ai-sdk-provider": "^2.3.1", - "ai": "^5.0.106", + "ai": "^6.0.0", "fast-glob": "^3.3.3", "json5": "^2.2.3", "micromatch": "^4.0.8", diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index ce8b2983a..99136efca 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -1527,6 +1527,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise {}); } else { - return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput }; + return { ...errorResult, evalRun, workspacePath, beforeEachOutput, afterEachOutput }; } } - return { ...errorResult, beforeEachOutput, afterEachOutput }; + return { ...errorResult, evalRun, beforeEachOutput, afterEachOutput }; } } @@ -2565,6 +2591,53 @@ function buildResultInput(promptInputs: PromptInputs): EvaluationResult['input'] return promptInputs.question; } +/** + * Sum token usage across all evaluator results (including nested children). + * Returns undefined when no evaluator reported token usage. + */ +function aggregateEvaluatorTokenUsage(scores?: readonly EvaluatorResult[]): TokenUsage | undefined { + if (!scores || scores.length === 0) return undefined; + + let hasAny = false; + let input = 0; + let output = 0; + let reasoning = 0; + let cached = 0; + let hasReasoning = false; + let hasCached = false; + + const visit = (items: readonly EvaluatorResult[]): void => { + for (const item of items) { + if (item.tokenUsage) { + hasAny = true; + input += item.tokenUsage.input; + output += item.tokenUsage.output; + if (item.tokenUsage.reasoning != null) { + hasReasoning = true; + reasoning += item.tokenUsage.reasoning; + } + if (item.tokenUsage.cached != null) { + hasCached = true; + cached += item.tokenUsage.cached; + } + } + if (item.scores) { + visit(item.scores); + } + } + }; + + visit(scores); + if (!hasAny) return undefined; + + return { + input, + output, + ...(hasReasoning ? { reasoning } : {}), + ...(hasCached ? { cached } : {}), + }; +} + function isTimeoutLike(error: unknown): boolean { if (!error) { return false; diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts index 1ba115452..d4c66db3d 100644 --- a/packages/core/src/evaluation/providers/agentv-provider.ts +++ b/packages/core/src/evaluation/providers/agentv-provider.ts @@ -31,9 +31,7 @@ function createLanguageModel(modelString: string): LanguageModel { switch (provider) { case 'openai': - // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the - // codebase uses LanguageModelV2. The runtime API is compatible. - return createOpenAI()(modelName) as unknown as LanguageModel; + return createOpenAI()(modelName); case 'anthropic': return createAnthropic()(modelName); case 'azure': diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts index 1af7b63bb..df561679e 100644 --- a/packages/core/src/evaluation/providers/ai-sdk.ts +++ b/packages/core/src/evaluation/providers/ai-sdk.ts @@ -133,9 +133,7 @@ export class OpenRouterProvider implements Provider { const openrouter = createOpenRouter({ apiKey: config.apiKey, }); - // Cast: OpenRouter may return LanguageModelV3 while the rest of the - // codebase uses LanguageModelV2. The runtime API is compatible. - this.model = openrouter(config.model) as unknown as LanguageModel; + this.model = openrouter(config.model); } async invoke(request: ProviderRequest): Promise { @@ -392,9 +390,16 @@ async function invokeModel(options: { function mapResponse(result: TextResult): ProviderResponse { const content = result.text ?? ''; const rawUsage = result.totalUsage ?? result.usage; + const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? undefined; + const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? undefined; const tokenUsage = rawUsage?.inputTokens != null && rawUsage?.outputTokens != null - ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens } + ? { + input: rawUsage.inputTokens, + output: rawUsage.outputTokens, + ...(reasoning != null ? { reasoning } : {}), + ...(cached != null ? { cached } : {}), + } : undefined; return { diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts index 659d82097..27fa2e200 100644 --- a/packages/core/src/evaluation/providers/claude-cli.ts +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -119,10 +119,12 @@ export class ClaudeCliProvider implements Provider { ((usage.cache_read_input_tokens as number) ?? 0) + ((usage.cache_creation_input_tokens as number) ?? 0); const outputTokens = (usage.output_tokens as number) ?? 0; + const reasoningTokens = (usage.reasoning_tokens as number) ?? undefined; tokenUsage = { input: inputTokens, output: outputTokens, cached: (usage.cache_read_input_tokens as number) ?? undefined, + reasoning: reasoningTokens, }; // Stream callback for LLM usage diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index 904751cf4..f482b711f 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -193,6 +193,8 @@ export interface ProviderTokenUsage { readonly output: number; /** Cached tokens (optional, provider-specific) */ readonly cached?: number; + /** Reasoning/thinking tokens (optional, provider-specific) */ + readonly reasoning?: number; } export interface ProviderResponse { diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts index 95f23de6e..a4d486d58 100644 --- a/packages/core/src/evaluation/trace.ts +++ b/packages/core/src/evaluation/trace.ts @@ -13,6 +13,8 @@ export interface TokenUsage { readonly output: number; /** Cached tokens (optional, provider-specific) */ readonly cached?: number; + /** Reasoning/thinking tokens (optional, provider-specific) */ + readonly reasoning?: number; } /** diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts index 0a4a32cfa..970903d5c 100644 --- a/packages/core/src/evaluation/types.ts +++ b/packages/core/src/evaluation/types.ts @@ -909,7 +909,7 @@ export interface EvaluationResult { readonly tokenUsage?: TokenUsage; /** Total cost in USD (optional, from provider) */ readonly costUsd?: number; - /** Total execution duration in milliseconds (optional) */ + /** Candidate/agent execution duration in milliseconds (excludes grading time) */ readonly durationMs?: number; /** ISO 8601 timestamp when execution started */ readonly startTime?: string; @@ -948,6 +948,11 @@ export interface EvaluationResult { readonly costLimited?: boolean; /** Whether the evaluation was skipped due to suite-level budget exhaustion */ readonly budgetExceeded?: boolean; + /** Aggregate metrics for the full eval run (candidate + grading) */ + readonly evalRun?: { + readonly durationMs?: number; + readonly tokenUsage?: TokenUsage; + }; /** Primary classification: ok, quality_failure, or execution_error */ readonly executionStatus: ExecutionStatus; /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */ diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts index 78559acca..72ce58ce7 100644 --- a/packages/core/test/evaluation/providers/agentv-provider.test.ts +++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts @@ -5,7 +5,7 @@ import { describe, expect, it, vi } from 'vitest'; vi.mock('@ai-sdk/openai', () => ({ createOpenAI: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'openai', }), })); @@ -13,7 +13,7 @@ vi.mock('@ai-sdk/openai', () => ({ vi.mock('@ai-sdk/anthropic', () => ({ createAnthropic: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'anthropic', }), })); @@ -21,7 +21,7 @@ vi.mock('@ai-sdk/anthropic', () => ({ vi.mock('@ai-sdk/azure', () => ({ createAzure: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'azure', }), })); @@ -29,7 +29,7 @@ vi.mock('@ai-sdk/azure', () => ({ vi.mock('@ai-sdk/google', () => ({ createGoogleGenerativeAI: () => (modelId: string) => ({ modelId, - specificationVersion: 'v2', + specificationVersion: 'v3', provider: 'google', }), }));