diff --git a/apps/cli/package.json b/apps/cli/package.json
index c0eb6dcac..d99932954 100644
--- a/apps/cli/package.json
+++ b/apps/cli/package.json
@@ -28,7 +28,7 @@
     "test:watch": "bun test --watch"
   },
   "dependencies": {
-    "@ai-sdk/openai": "^2.0.0",
+    "@ai-sdk/openai": "^3.0.0",
     "@anthropic-ai/claude-agent-sdk": "^0.2.49",
     "@github/copilot-sdk": "^0.1.25",
     "@inquirer/prompts": "^8.2.1",
diff --git a/apps/cli/src/commands/eval/artifact-writer.ts b/apps/cli/src/commands/eval/artifact-writer.ts
index 275484c01..893ec4861 100644
--- a/apps/cli/src/commands/eval/artifact-writer.ts
+++ b/apps/cli/src/commands/eval/artifact-writer.ts
@@ -49,6 +49,7 @@ export interface TimingArtifact {
   readonly token_usage: {
     readonly input: number;
     readonly output: number;
+    readonly reasoning: number;
   };
 }
 
@@ -273,13 +274,17 @@ export function buildGradingArtifact(result: EvaluationResult): GradingArtifact
 export function buildTimingArtifact(results: readonly EvaluationResult[]): TimingArtifact {
   let totalInput = 0;
   let totalOutput = 0;
+  let totalReasoning = 0;
   let totalDurationMs = 0;
 
   for (const result of results) {
-    const usage = result.tokenUsage as { input?: number; output?: number } | undefined;
+    const usage = result.tokenUsage as
+      | { input?: number; output?: number; reasoning?: number }
+      | undefined;
     if (usage) {
       totalInput += usage.input ?? 0;
       totalOutput += usage.output ?? 0;
+      totalReasoning += usage.reasoning ?? 0;
     }
     if (result.durationMs != null) {
       totalDurationMs += result.durationMs;
@@ -293,6 +298,7 @@ export function buildTimingArtifact(results: readonly EvaluationResult[]): Timin
     token_usage: {
       input: totalInput,
       output: totalOutput,
+      reasoning: totalReasoning,
     },
   };
 }
diff --git a/apps/cli/test/commands/eval/artifact-writer.test.ts b/apps/cli/test/commands/eval/artifact-writer.test.ts
index 07229e9de..2edd42c2a 100644
--- a/apps/cli/test/commands/eval/artifact-writer.test.ts
+++ b/apps/cli/test/commands/eval/artifact-writer.test.ts
@@ -202,7 +202,7 @@ describe('buildTimingArtifact', () => {
     expect(timing.total_tokens).toBe(4500);
     expect(timing.duration_ms).toBe(90000);
     expect(timing.total_duration_seconds).toBe(90);
-    expect(timing.token_usage).toEqual({ input: 3000, output: 1500 });
+    expect(timing.token_usage).toEqual({ input: 3000, output: 1500, reasoning: 0 });
   });
 
   it('handles results with no timing data', () => {
@@ -212,7 +212,7 @@ describe('buildTimingArtifact', () => {
     expect(timing.total_tokens).toBe(0);
     expect(timing.duration_ms).toBe(0);
     expect(timing.total_duration_seconds).toBe(0);
-    expect(timing.token_usage).toEqual({ input: 0, output: 0 });
+    expect(timing.token_usage).toEqual({ input: 0, output: 0, reasoning: 0 });
   });
 
   it('handles empty results array', () => {
@@ -232,7 +232,7 @@ describe('buildTimingArtifact', () => {
 
     const timing = buildTimingArtifact(results);
     expect(timing.total_tokens).toBe(500);
-    expect(timing.token_usage).toEqual({ input: 500, output: 0 });
+    expect(timing.token_usage).toEqual({ input: 500, output: 0, reasoning: 0 });
   });
 });
 
diff --git a/bun.lock b/bun.lock
index 101d988bc..12600c368 100644
--- a/bun.lock
+++ b/bun.lock
@@ -29,7 +29,7 @@
         "agentv": "./dist/cli.js",
       },
       "dependencies": {
-        "@ai-sdk/openai": "^2.0.0",
+        "@ai-sdk/openai": "^3.0.0",
         "@anthropic-ai/claude-agent-sdk": "^0.2.49",
         "@github/copilot-sdk": "^0.1.25",
         "@inquirer/prompts": "^8.2.1",
@@ -66,17 +66,17 @@
       "dependencies": {
         "@agentclientprotocol/sdk": "^0.14.1",
         "@agentv/eval": "workspace:*",
-        "@ai-sdk/anthropic": "^2.0.53",
-        "@ai-sdk/azure": "^2.0.78",
-        "@ai-sdk/google": "^2.0.44",
-        "@ai-sdk/openai": "^2.0.0",
+        "@ai-sdk/anthropic": "^3.0.0",
+        "@ai-sdk/azure": "^3.0.0",
+        "@ai-sdk/google": "^3.0.0",
+        "@ai-sdk/openai": "^3.0.0",
         "@anthropic-ai/claude-agent-sdk": "^0.2.49",
         "@github/copilot-sdk": "^0.1.25",
         "@mariozechner/pi-agent-core": "^0.54.2",
         "@mariozechner/pi-ai": "^0.54.2",
         "@openai/codex-sdk": "^0.104.0",
         "@openrouter/ai-sdk-provider": "^2.3.1",
-        "ai": "^5.0.106",
+        "ai": "^6.0.0",
         "fast-glob": "^3.3.3",
         "json5": "^2.2.3",
         "micromatch": "^4.0.8",
@@ -113,19 +113,19 @@
 
     "@agentv/web": ["@agentv/web@workspace:apps/web"],
 
-    "@ai-sdk/anthropic": ["@ai-sdk/anthropic@2.0.56", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-XHJKu0Yvfu9SPzRfsAFESa+9T7f2YJY6TxykKMfRsAwpeWAiX/Gbx5J5uM15AzYC3Rw8tVP3oH+j7jEivENirQ=="],
+    "@ai-sdk/anthropic": ["@ai-sdk/anthropic@3.0.58", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-/53SACgmVukO4bkms4dpxpRlYhW8Ct6QZRe6sj1Pi5H00hYhxIrqfiLbZBGxkdRvjsBQeP/4TVGsXgH5rQeb8Q=="],
 
-    "@ai-sdk/azure": ["@ai-sdk/azure@2.0.87", "", { "dependencies": { "@ai-sdk/openai": "2.0.85", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-raGHMKOqsUIWtWyC1IRxgB+D/MrGldNh5l6HUyHblKKA9yXrIr4RThpLUPhfpt58vvSxgM4yXaOyiFE6AtIDTQ=="],
+    "@ai-sdk/azure": ["@ai-sdk/azure@3.0.42", "", { "dependencies": { "@ai-sdk/openai": "3.0.41", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BGg0e3GEI7KHkwUv7d5f9rXzDlTiWhQ4xzVakdHLV/OP24jvXes5X7fI3QZ0rbKBop6URq0yaxomBfwEqqRlzw=="],
 
-    "@ai-sdk/gateway": ["@ai-sdk/gateway@2.0.21", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@vercel/oidc": "3.0.5" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-BwV7DU/lAm3Xn6iyyvZdWgVxgLu3SNXzl5y57gMvkW4nGhAOV5269IrJzQwGt03bb107sa6H6uJwWxc77zXoGA=="],
+    "@ai-sdk/gateway": ["@ai-sdk/gateway@3.0.66", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@vercel/oidc": "3.1.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-SIQ0YY0iMuv+07HLsZ+bB990zUJ6S4ujORAh+Jv1V2KGNn73qQKnGO0JBk+w+Res8YqOFSycwDoWcFlQrVxS4A=="],
 
-    "@ai-sdk/google": ["@ai-sdk/google@2.0.46", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-8PK6u4sGE/kXebd7ZkTp+0aya4kNqzoqpS5m7cHY2NfTK6fhPc6GNvE+MZIZIoHQTp5ed86wGBdeBPpFaaUtyg=="],
+    "@ai-sdk/google": ["@ai-sdk/google@3.0.43", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-NGCgP5g8HBxrNdxvF8Dhww+UKfqAkZAmyYBvbu9YLoBkzAmGKDBGhVptN/oXPB5Vm0jggMdoLycZ8JReQM8Zqg=="],
 
-    "@ai-sdk/openai": ["@ai-sdk/openai@2.0.85", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3pzr7qVhsOXwjPAfmvFNZz3sRWCuyMOc3GgLHe7sWY0t8J4hA5mwQ4LISTKYI3iIr8IXzAQn9MUrC8Hiji9RpA=="],
+    "@ai-sdk/openai": ["@ai-sdk/openai@3.0.41", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-IZ42A+FO+vuEQCVNqlnAPYQnnUpUfdJIwn1BEDOBywiEHa23fw7PahxVtlX9zm3/zMvTW4JKPzWyvAgDu+SQ2A=="],
 
-    "@ai-sdk/provider": ["@ai-sdk/provider@2.0.0", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-6o7Y2SeO9vFKB8lArHXehNuusnpddKPk7xqL7T2/b+OvXMRIXUO1rR4wcv1hAFUAT9avGZshty3Wlua/XA7TvA=="],
+    "@ai-sdk/provider": ["@ai-sdk/provider@3.0.8", "", { "dependencies": { "json-schema": "^0.4.0" } }, "sha512-oGMAgGoQdBXbZqNG0Ze56CHjDZ1IDYOwGYxYjO5KLSlz5HiNQ9udIXsPZ61VWaHGZ5XW/jyjmr6t2xz2jGVwbQ=="],
 
-    "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@3.0.19", "", { "dependencies": { "@ai-sdk/provider": "2.0.0", "@standard-schema/spec": "^1.0.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-W41Wc9/jbUVXVwCN/7bWa4IKe8MtxO3EyA0Hfhx6grnmiYlCvpI8neSYWFE0zScXJkgA/YK3BRybzgyiXuu6JA=="],
+    "@ai-sdk/provider-utils": ["@ai-sdk/provider-utils@4.0.19", "", { "dependencies": { "@ai-sdk/provider": "3.0.8", "@standard-schema/spec": "^1.1.0", "eventsource-parser": "^3.0.6" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-3eG55CrSWCu2SXlqq2QCsFjo3+E7+Gmg7i/oRVoSZzIodTuDSfLb3MRje67xE9RFea73Zao7Lm4mADIfUETKGg=="],
 
     "@anthropic-ai/claude-agent-sdk": ["@anthropic-ai/claude-agent-sdk@0.2.49", "", { "optionalDependencies": { "@img/sharp-darwin-arm64": "^0.34.2", "@img/sharp-darwin-x64": "^0.34.2", "@img/sharp-linux-arm": "^0.34.2", "@img/sharp-linux-arm64": "^0.34.2", "@img/sharp-linux-x64": "^0.34.2", "@img/sharp-linuxmusl-arm64": "^0.34.2", "@img/sharp-linuxmusl-x64": "^0.34.2", "@img/sharp-win32-arm64": "^0.34.2", "@img/sharp-win32-x64": "^0.34.2" }, "peerDependencies": { "zod": "^4.0.0" } }, "sha512-3avi409dwuGkPEETpWa0gyJvRMr3b6LxeuW5/sAPCOtLD9WxH9fYltbA5wZoazxTw5mlbXmjDp7JqO1rlmpaIQ=="],
 
@@ -709,7 +709,7 @@
 
     "@speed-highlight/core": ["@speed-highlight/core@1.2.14", "", {}, "sha512-G4ewlBNhUtlLvrJTb88d2mdy2KRijzs4UhnlrOSRT4bmjh/IqNElZa3zkrZ+TC47TwtlDWzVLFADljF1Ijp5hA=="],
 
-    "@standard-schema/spec": ["@standard-schema/spec@1.0.0", "", {}, "sha512-m2bOd0f2RT9k8QJx1JN85cZYyH1RqFBdlwtkSlf4tBDYLCiiZnv1fIIwacK6cqwXavOydf0NPToMQgpKq+dVlA=="],
+    "@standard-schema/spec": ["@standard-schema/spec@1.1.0", "", {}, "sha512-l2aFy5jALhniG5HgqrD6jXLi/rUWrKvqN/qJx6yoJsgKhblVd+iqqU4RCXavm/jPityDo5TCvKMnpjKnOriy0w=="],
 
     "@tootallnate/quickjs-emscripten": ["@tootallnate/quickjs-emscripten@0.23.0", "", {}, "sha512-C5Mc6rdnsaJDjO3UpGW/CQTHtCKaYlScZTly4JIu97Jxo/odCiH0ITnDXSJPTOrEKk/ycSZ0AOgTmkDtkOsvIA=="],
 
@@ -749,7 +749,7 @@
 
     "@ungap/structured-clone": ["@ungap/structured-clone@1.3.0", "", {}, "sha512-WmoN8qaIAo7WTYWbAZuG8PYEhn5fkz7dZrqTBZ7dtt//lL2Gwms1IcnQ5yHqjDfX8Ft5j4YzDM23f87zBfDe9g=="],
 
-    "@vercel/oidc": ["@vercel/oidc@3.0.5", "", {}, "sha512-fnYhv671l+eTTp48gB4zEsTW/YtRgRPnkI2nT7x6qw5rkI1Lq2hTmQIpHPgyThI0znLK+vX2n9XxKdXZ7BUbbw=="],
+    "@vercel/oidc": ["@vercel/oidc@3.1.0", "", {}, "sha512-Fw28YZpRnA3cAHHDlkt7xQHiJ0fcL+NRcIqsocZQUSmbzeIKRpwttJjik5ZGanXP+vlA4SbTg+AbA3bP363l+w=="],
 
     "acorn": ["acorn@8.15.0", "", { "bin": { "acorn": "bin/acorn" } }, "sha512-NZyJarBfL7nWwIq+FDL6Zp/yHEhePMNnnJ0y3qfieCrmNvYct8uvtiV41UvlSe6apAfk0fY1FbWx+NwfmpvtTg=="],
 
@@ -761,7 +761,7 @@
 
     "agentv": ["agentv@workspace:apps/cli"],
 
-    "ai": ["ai@5.0.112", "", { "dependencies": { "@ai-sdk/gateway": "2.0.21", "@ai-sdk/provider": "2.0.0", "@ai-sdk/provider-utils": "3.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-Y0dluYpe5wn81UkfHbZL78mH6CsceUfMiu4oPRaWZvjlmcoXSPdEAsPcYbOjvX8ZPvQc6m4kNZhkcEXmT2ln4w=="],
+    "ai": ["ai@6.0.116", "", { "dependencies": { "@ai-sdk/gateway": "3.0.66", "@ai-sdk/provider": "3.0.8", "@ai-sdk/provider-utils": "4.0.19", "@opentelemetry/api": "1.9.0" }, "peerDependencies": { "zod": "^3.25.76 || ^4.1.8" } }, "sha512-7yM+cTmyRLeNIXwt4Vj+mrrJgVQ9RMIW5WO0ydoLoYkewIvsMcvUmqS4j2RJTUXaF1HphwmSKUMQ/HypNRGOmA=="],
 
     "ajv": ["ajv@8.17.1", "", { "dependencies": { "fast-deep-equal": "^3.1.3", "fast-uri": "^3.0.1", "json-schema-traverse": "^1.0.0", "require-from-string": "^2.0.2" } }, "sha512-B/gBuNg5SiMTrPkC+A2+cW0RszwxYmn6VYxB/inlBStS5nx6xHIt/ehKRhIMhqusl7a8LjQoZnjCs5vhwxOQ1g=="],
 
diff --git a/packages/core/package.json b/packages/core/package.json
index a8476ffb1..79702d601 100644
--- a/packages/core/package.json
+++ b/packages/core/package.json
@@ -41,17 +41,17 @@
   "dependencies": {
     "@agentclientprotocol/sdk": "^0.14.1",
     "@agentv/eval": "workspace:*",
-    "@ai-sdk/anthropic": "^2.0.53",
-    "@ai-sdk/azure": "^2.0.78",
-    "@ai-sdk/google": "^2.0.44",
-    "@ai-sdk/openai": "^2.0.0",
+    "@ai-sdk/anthropic": "^3.0.0",
+    "@ai-sdk/azure": "^3.0.0",
+    "@ai-sdk/google": "^3.0.0",
+    "@ai-sdk/openai": "^3.0.0",
     "@anthropic-ai/claude-agent-sdk": "^0.2.49",
     "@github/copilot-sdk": "^0.1.25",
     "@mariozechner/pi-agent-core": "^0.54.2",
     "@mariozechner/pi-ai": "^0.54.2",
     "@openai/codex-sdk": "^0.104.0",
     "@openrouter/ai-sdk-provider": "^2.3.1",
-    "ai": "^5.0.106",
+    "ai": "^6.0.0",
     "fast-glob": "^3.3.3",
     "json5": "^2.2.3",
     "micromatch": "^4.0.8",
diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts
index ce8b2983a..99136efca 100644
--- a/packages/core/src/evaluation/orchestrator.ts
+++ b/packages/core/src/evaluation/orchestrator.ts
@@ -1527,6 +1527,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
     }
   }
 
+  const caseStartMs = Date.now();
   const attemptBudget = (maxRetries ?? 0) + 1;
   let attempt = 0;
   let providerResponse: ProviderResponse | undefined = cachedResponse;
@@ -1713,6 +1714,29 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       workspacePath,
     });
 
+    const totalDurationMs = Date.now() - caseStartMs;
+
+    // Aggregate grader token usage from individual evaluator results
+    const graderTokens = aggregateEvaluatorTokenUsage(result.scores);
+    const evalRunTokenUsage =
+      tokenUsage || graderTokens
+        ? {
+            input: (tokenUsage?.input ?? 0) + (graderTokens?.input ?? 0),
+            output: (tokenUsage?.output ?? 0) + (graderTokens?.output ?? 0),
+            ...(tokenUsage?.reasoning != null || graderTokens?.reasoning != null
+              ? { reasoning: (tokenUsage?.reasoning ?? 0) + (graderTokens?.reasoning ?? 0) }
+              : {}),
+            ...(tokenUsage?.cached != null || graderTokens?.cached != null
+              ? { cached: (tokenUsage?.cached ?? 0) + (graderTokens?.cached ?? 0) }
+              : {}),
+          }
+        : undefined;
+
+    const evalRun = {
+      durationMs: totalDurationMs,
+      ...(evalRunTokenUsage ? { tokenUsage: evalRunTokenUsage } : {}),
+    };
+
     const executionStatus: ExecutionStatus = providerError
       ? 'execution_error'
       : classifyQualityStatus(result.score);
@@ -1720,6 +1744,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
     const finalResult = providerError
       ? {
           ...result,
+          evalRun,
           error: providerError,
           executionStatus,
           failureStage: 'agent' as const,
@@ -1729,7 +1754,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
           beforeEachOutput,
           afterEachOutput,
         }
-      : { ...result, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
+      : { ...result, evalRun, executionStatus, beforeAllOutput, beforeEachOutput, afterEachOutput };
 
     // Determine if this is a failure (has error or low score)
     const isFailure = !!finalResult.error || finalResult.score < 0.5;
@@ -1751,6 +1776,7 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
 
     return finalResult;
   } catch (error) {
+    const evalRun = { durationMs: Date.now() - caseStartMs };
     const errorResult = buildErrorResult(
       evalCase,
       target.name,
@@ -1766,10 +1792,10 @@ export async function runEvalCase(options: RunEvalCaseOptions): Promise<Evaluati
       if (forceCleanup || (retainOnFailure ?? 'keep') === 'cleanup') {
         await cleanupWorkspace(workspacePath).catch(() => {});
       } else {
-        return { ...errorResult, workspacePath, beforeEachOutput, afterEachOutput };
+        return { ...errorResult, evalRun, workspacePath, beforeEachOutput, afterEachOutput };
       }
     }
-    return { ...errorResult, beforeEachOutput, afterEachOutput };
+    return { ...errorResult, evalRun, beforeEachOutput, afterEachOutput };
   }
 }
 
@@ -2565,6 +2591,53 @@ function buildResultInput(promptInputs: PromptInputs): EvaluationResult['input']
   return promptInputs.question;
 }
 
+/**
+ * Totals the token usage reported by evaluator results, descending into any
+ * nested `scores` so child results are counted as well.
+ * Yields undefined when no result carries token usage.
+ */
+function aggregateEvaluatorTokenUsage(scores?: readonly EvaluatorResult[]): TokenUsage | undefined {
+  if (!scores || scores.length === 0) return undefined;
+
+  // Running sums, plus flags recording which counters were actually reported.
+  const sums = { input: 0, output: 0, reasoning: 0, cached: 0 };
+  const seen = { usage: false, reasoning: false, cached: false };
+
+  const walk = (nodes: readonly EvaluatorResult[]): void => {
+    for (const node of nodes) {
+      const usage = node.tokenUsage;
+      if (usage) {
+        seen.usage = true;
+        sums.input += usage.input;
+        sums.output += usage.output;
+        if (usage.reasoning != null) {
+          seen.reasoning = true;
+          sums.reasoning += usage.reasoning;
+        }
+        if (usage.cached != null) {
+          seen.cached = true;
+          sums.cached += usage.cached;
+        }
+      }
+      // Recurse into child results so their usage is included in the totals.
+      if (node.scores) walk(node.scores);
+    }
+  };
+
+  walk(scores);
+  if (!seen.usage) return undefined;
+
+  // Optional counters appear only when at least one result reported them.
+  const reasoningPart = seen.reasoning ? { reasoning: sums.reasoning } : {};
+  const cachedPart = seen.cached ? { cached: sums.cached } : {};
+  return {
+    input: sums.input,
+    output: sums.output,
+    ...reasoningPart,
+    ...cachedPart,
+  };
+}
+
 function isTimeoutLike(error: unknown): boolean {
   if (!error) {
     return false;
diff --git a/packages/core/src/evaluation/providers/agentv-provider.ts b/packages/core/src/evaluation/providers/agentv-provider.ts
index 1ba115452..d4c66db3d 100644
--- a/packages/core/src/evaluation/providers/agentv-provider.ts
+++ b/packages/core/src/evaluation/providers/agentv-provider.ts
@@ -31,9 +31,7 @@ function createLanguageModel(modelString: string): LanguageModel {
 
   switch (provider) {
     case 'openai':
-      // Cast: @ai-sdk/openai may return LanguageModelV3 while the rest of the
-      // codebase uses LanguageModelV2. The runtime API is compatible.
-      return createOpenAI()(modelName) as unknown as LanguageModel;
+      return createOpenAI()(modelName);
     case 'anthropic':
       return createAnthropic()(modelName);
     case 'azure':
diff --git a/packages/core/src/evaluation/providers/ai-sdk.ts b/packages/core/src/evaluation/providers/ai-sdk.ts
index 1af7b63bb..df561679e 100644
--- a/packages/core/src/evaluation/providers/ai-sdk.ts
+++ b/packages/core/src/evaluation/providers/ai-sdk.ts
@@ -133,9 +133,7 @@ export class OpenRouterProvider implements Provider {
     const openrouter = createOpenRouter({
       apiKey: config.apiKey,
     });
-    // Cast: OpenRouter may return LanguageModelV3 while the rest of the
-    // codebase uses LanguageModelV2. The runtime API is compatible.
-    this.model = openrouter(config.model) as unknown as LanguageModel;
+    this.model = openrouter(config.model);
   }
 
   async invoke(request: ProviderRequest): Promise<ProviderResponse> {
@@ -392,9 +390,16 @@ async function invokeModel(options: {
 function mapResponse(result: TextResult): ProviderResponse {
   const content = result.text ?? '';
   const rawUsage = result.totalUsage ?? result.usage;
+  const reasoning = rawUsage?.outputTokenDetails?.reasoningTokens ?? undefined;
+  const cached = rawUsage?.inputTokenDetails?.cacheReadTokens ?? undefined;
   const tokenUsage =
     rawUsage?.inputTokens != null && rawUsage?.outputTokens != null
-      ? { input: rawUsage.inputTokens, output: rawUsage.outputTokens }
+      ? {
+          input: rawUsage.inputTokens,
+          output: rawUsage.outputTokens,
+          ...(reasoning != null ? { reasoning } : {}),
+          ...(cached != null ? { cached } : {}),
+        }
       : undefined;
 
   return {
diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts
index 659d82097..27fa2e200 100644
--- a/packages/core/src/evaluation/providers/claude-cli.ts
+++ b/packages/core/src/evaluation/providers/claude-cli.ts
@@ -119,10 +119,12 @@ export class ClaudeCliProvider implements Provider {
                 ((usage.cache_read_input_tokens as number) ?? 0) +
                 ((usage.cache_creation_input_tokens as number) ?? 0);
               const outputTokens = (usage.output_tokens as number) ?? 0;
+              const reasoningTokens = (usage.reasoning_tokens as number) ?? undefined;
               tokenUsage = {
                 input: inputTokens,
                 output: outputTokens,
                 cached: (usage.cache_read_input_tokens as number) ?? undefined,
+                reasoning: reasoningTokens,
               };
 
               // Stream callback for LLM usage
diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts
index 904751cf4..f482b711f 100644
--- a/packages/core/src/evaluation/providers/types.ts
+++ b/packages/core/src/evaluation/providers/types.ts
@@ -193,6 +193,8 @@ export interface ProviderTokenUsage {
   readonly output: number;
   /** Cached tokens (optional, provider-specific) */
   readonly cached?: number;
+  /** Reasoning/thinking tokens (optional, provider-specific) */
+  readonly reasoning?: number;
 }
 
 export interface ProviderResponse {
diff --git a/packages/core/src/evaluation/trace.ts b/packages/core/src/evaluation/trace.ts
index 95f23de6e..a4d486d58 100644
--- a/packages/core/src/evaluation/trace.ts
+++ b/packages/core/src/evaluation/trace.ts
@@ -13,6 +13,8 @@ export interface TokenUsage {
   readonly output: number;
   /** Cached tokens (optional, provider-specific) */
   readonly cached?: number;
+  /** Reasoning/thinking tokens (optional, provider-specific) */
+  readonly reasoning?: number;
 }
 
 /**
diff --git a/packages/core/src/evaluation/types.ts b/packages/core/src/evaluation/types.ts
index 0a4a32cfa..970903d5c 100644
--- a/packages/core/src/evaluation/types.ts
+++ b/packages/core/src/evaluation/types.ts
@@ -909,7 +909,7 @@ export interface EvaluationResult {
   readonly tokenUsage?: TokenUsage;
   /** Total cost in USD (optional, from provider) */
   readonly costUsd?: number;
-  /** Total execution duration in milliseconds (optional) */
+  /** Candidate/agent execution duration in milliseconds (excludes grading time) */
   readonly durationMs?: number;
   /** ISO 8601 timestamp when execution started */
   readonly startTime?: string;
@@ -948,6 +948,11 @@ export interface EvaluationResult {
   readonly costLimited?: boolean;
   /** Whether the evaluation was skipped due to suite-level budget exhaustion */
   readonly budgetExceeded?: boolean;
+  /** Aggregate metrics for the full eval run (candidate + grading) */
+  readonly evalRun?: {
+    readonly durationMs?: number;
+    readonly tokenUsage?: TokenUsage;
+  };
   /** Primary classification: ok, quality_failure, or execution_error */
   readonly executionStatus: ExecutionStatus;
   /** Pipeline stage where failure occurred (only when executionStatus !== 'ok') */
diff --git a/packages/core/test/evaluation/providers/agentv-provider.test.ts b/packages/core/test/evaluation/providers/agentv-provider.test.ts
index 78559acca..72ce58ce7 100644
--- a/packages/core/test/evaluation/providers/agentv-provider.test.ts
+++ b/packages/core/test/evaluation/providers/agentv-provider.test.ts
@@ -5,7 +5,7 @@ import { describe, expect, it, vi } from 'vitest';
 vi.mock('@ai-sdk/openai', () => ({
   createOpenAI: () => (modelId: string) => ({
     modelId,
-    specificationVersion: 'v2',
+    specificationVersion: 'v3',
     provider: 'openai',
   }),
 }));
@@ -13,7 +13,7 @@ vi.mock('@ai-sdk/openai', () => ({
 vi.mock('@ai-sdk/anthropic', () => ({
   createAnthropic: () => (modelId: string) => ({
     modelId,
-    specificationVersion: 'v2',
+    specificationVersion: 'v3',
     provider: 'anthropic',
   }),
 }));
@@ -21,7 +21,7 @@ vi.mock('@ai-sdk/anthropic', () => ({
 vi.mock('@ai-sdk/azure', () => ({
   createAzure: () => (modelId: string) => ({
     modelId,
-    specificationVersion: 'v2',
+    specificationVersion: 'v3',
     provider: 'azure',
   }),
 }));
@@ -29,7 +29,7 @@ vi.mock('@ai-sdk/azure', () => ({
 vi.mock('@ai-sdk/google', () => ({
   createGoogleGenerativeAI: () => (modelId: string) => ({
     modelId,
-    specificationVersion: 'v2',
+    specificationVersion: 'v3',
     provider: 'google',
   }),
 }));