diff --git a/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts b/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts new file mode 100644 index 000000000..fb2d4d866 --- /dev/null +++ b/examples/features/agent-skills-evals/.agentv/judges/trigger-judge.ts @@ -0,0 +1,89 @@ +#!/usr/bin/env bun +/** + * trigger-judge: detects whether the agent invoked a named Claude Code skill. + * + * Mirrors the post-hoc fallback detection in skill-creator's run_eval.py: + * - Only the FIRST tool call matters. Any non-Skill/Read tool as the first + * call means the skill was not triggered (mirrors run_eval.py's early-exit). + * - Skill tool: checks input.skill contains the skill name (case-sensitive). + * - Read tool: checks input.file_path contains the skill name (case-sensitive). + * - Supports negative cases via should_trigger: false. + * + * Usage in eval YAML: + * assertions: + * - type: trigger-judge # discovered from .agentv/judges/ + * skill: my-skill-name # required: exact name as installed in .claude/commands/ + * should_trigger: true # optional: expected behaviour (default: true) + * + * Positive case (should_trigger: true): passes when skill fires. + * Negative case (should_trigger: false): passes when skill does NOT fire. + * + * Result: score 1 with one hits entry when the observed trigger state equals should_trigger, + * otherwise score 0 with one misses entry. A missing skill option is reported as score 0. + * NOTE(review): matching uses String.includes, so a skill named foo also matches foo-bar - + * confirm that substring matching is intended, given the usage note asks for the exact name. + */ +import { defineCodeJudge } from '@agentv/eval'; + +export default defineCodeJudge(({ output, config }) => { + const skillName = config?.skill as string | undefined; + const shouldTrigger = (config?.should_trigger ?? true) as boolean; + + if (!skillName) { + return { + score: 0, + misses: ['config.skill is required'], + reasoning: 'No skill name configured', + }; + } + + // Flatten all tool calls across messages and take only the first one. + // run_eval.py returns false as soon as a non-Skill/Read tool starts, so + // only the first tool call is relevant. + const firstTool = (output ?? []).flatMap((msg) => msg.toolCalls ?? 
[])[0]; + + let triggered = false; + let evidence = ''; + + if (firstTool) { + const input = (firstTool.input ?? {}) as Record; + + if (firstTool.tool === 'Skill') { + const skillArg = String(input.skill ?? ''); + if (skillArg.includes(skillName)) { + triggered = true; + evidence = `Skill tool invoked with skill="${skillArg}"`; + } + } else if (firstTool.tool === 'Read') { + const filePath = String(input.file_path ?? ''); + if (filePath.includes(skillName)) { + triggered = true; + evidence = `Read tool loaded skill file: ${filePath}`; + } + } + // Any other tool as first call: triggered remains false + } + + // Pass when the observed trigger state matches the expected one; this covers positive and negative cases. + const pass = triggered === shouldTrigger; + + if (pass) { + return { + score: 1, + hits: [ + shouldTrigger + ? evidence || `Skill "${skillName}" triggered as expected` + : `Skill "${skillName}" correctly did not trigger`, + ], + reasoning: shouldTrigger ? 'Skill triggered correctly' : 'No false trigger', + }; + } + + return { + score: 0, + misses: [ + shouldTrigger + ? firstTool + ? `First tool was "${firstTool.tool}" — not Skill/Read for "${skillName}"` + : `No tool calls recorded` + : evidence || `Skill "${skillName}" triggered unexpectedly`, + ], + reasoning: shouldTrigger + ? 
`Skill "${skillName}" was not triggered` + : `False trigger: skill fired when it should not have`, + }; +}); diff --git a/packages/core/src/evaluation/orchestrator.ts b/packages/core/src/evaluation/orchestrator.ts index 17d0f2abb..12047bb8c 100644 --- a/packages/core/src/evaluation/orchestrator.ts +++ b/packages/core/src/evaluation/orchestrator.ts @@ -28,7 +28,7 @@ import type { TargetDefinition, } from './providers/types.js'; import { extractLastAssistantContent, isAgentProvider } from './providers/types.js'; -import { createBuiltinRegistry, discoverAssertions } from './registry/index.js'; +import { createBuiltinRegistry, discoverAssertions, discoverJudges } from './registry/index.js'; import { type TokenUsage, type TraceSummary, @@ -375,6 +375,7 @@ export async function runEvaluation( // Directory containing the eval YAML file, used as default cwd for workspace scripts const evalDir = discoveryBaseDir; await discoverAssertions(typeRegistry, discoveryBaseDir); + await discoverJudges(typeRegistry, discoveryBaseDir); // Discover custom providers from .agentv/providers/ directory const providerRegistry = createBuiltinProviderRegistry(); diff --git a/packages/core/src/evaluation/providers/claude-cli.ts b/packages/core/src/evaluation/providers/claude-cli.ts new file mode 100644 index 000000000..659d82097 --- /dev/null +++ b/packages/core/src/evaluation/providers/claude-cli.ts @@ -0,0 +1,597 @@ +import { spawn } from 'node:child_process'; +import { randomUUID } from 'node:crypto'; +import { createWriteStream } from 'node:fs'; +import type { WriteStream } from 'node:fs'; +import { mkdir } from 'node:fs/promises'; +import path from 'node:path'; + +import { recordClaudeLogEntry } from './claude-log-tracker.js'; +import { buildPromptDocument, normalizeInputFiles } from './preread.js'; +import type { ClaudeResolvedConfig } from './targets.js'; +import type { + Message, + Provider, + ProviderRequest, + ProviderResponse, + ProviderTokenUsage, + ToolCall, +} from './types.js'; 
+ +/** + * Claude CLI provider that spawns `claude -p` as a subprocess. + * Uses --output-format stream-json --include-partial-messages for structured output. + * This is the default `claude` provider. Use `claude-sdk` for SDK-based invocation. + */ +export class ClaudeCliProvider implements Provider { + readonly id: string; + readonly kind = 'claude-cli' as const; + readonly targetName: string; + readonly supportsBatch = false; + + private readonly config: ClaudeResolvedConfig; + + constructor(targetName: string, config: ClaudeResolvedConfig) { + this.id = `claude-cli:${targetName}`; + this.targetName = targetName; + this.config = config; + } + /** + * Runs one request by spawning the claude CLI with the prompt on stdin and parsing each stdout line as a JSON stream event. + * Assistant events become output messages (text plus tool calls); the result event supplies cost, duration and token usage. + * Throws when the request is already aborted, when the run times out, or when the CLI exits with a non-zero code. + * The stream logger, if one was created, is closed in the finally block on every path. + */ + async invoke(request: ProviderRequest): Promise { + if (request.signal?.aborted) { + throw new Error('Claude CLI request was aborted before execution'); + } + + const startTime = new Date().toISOString(); + const startMs = Date.now(); + + const logger = await this.createStreamLogger(request).catch(() => undefined); + + // Build the prompt + const inputFiles = normalizeInputFiles(request.inputFiles); + const prompt = buildPromptDocument(request, inputFiles); + + const args = this.buildArgs(); + const cwd = this.resolveCwd(request.cwd); + const env = sanitizeEnvForClaude(request.braintrustSpanIds); + + // Track state from stream events + // NOTE(review): completedToolCalls is appended to but never read in this method - confirm it is still needed. + const completedToolCalls: ToolCall[] = []; + const output: Message[] = []; + let tokenUsage: ProviderTokenUsage | undefined; + let costUsd: number | undefined; + let durationMs: number | undefined; + + try { + const result = await this.runClaude({ + args, + cwd, + prompt, + env, + signal: request.signal, + onLine: (line) => { + logger?.handleLine(line); + const event = tryParseJson(line); + if (!event) return; + + if (event.type === 'assistant') { + const betaMessage = event.message; + if (betaMessage && typeof betaMessage === 'object') { + const msg = betaMessage as Record; + const content = msg.content; + const textContent = extractTextContent(content); + const toolCalls = 
extractToolCalls(content); + + const outputMsg: Message = { + role: 'assistant', + content: textContent, + toolCalls: toolCalls.length > 0 ? toolCalls : undefined, + }; + output.push(outputMsg); + completedToolCalls.push(...toolCalls); + + // Stream callbacks for real-time observability + // extractToolCalls sets only tool, input and id, so tc.output is undefined and the duration falls back to 0 here. + if (request.streamCallbacks) { + for (const tc of toolCalls) { + request.streamCallbacks.onToolCallEnd?.( + tc.tool, + tc.input, + tc.output, + tc.durationMs ?? 0, + tc.id, + ); + } + } + } + } + + if (event.type === 'result') { + const resultEvent = event as Record; + if (typeof resultEvent.total_cost_usd === 'number') { + costUsd = resultEvent.total_cost_usd; + } + if (typeof resultEvent.duration_ms === 'number') { + durationMs = resultEvent.duration_ms; + } + const usage = resultEvent.usage as Record | undefined; + if (usage) { + // Reported input total sums plain, cache-read and cache-creation input tokens; cached carries the cache-read count only. + const inputTokens = + ((usage.input_tokens as number) ?? 0) + + ((usage.cache_read_input_tokens as number) ?? 0) + + ((usage.cache_creation_input_tokens as number) ?? 0); + const outputTokens = (usage.output_tokens as number) ?? 0; + tokenUsage = { + input: inputTokens, + output: outputTokens, + cached: (usage.cache_read_input_tokens as number) ?? undefined, + }; + + // Stream callback for LLM usage + request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? 'claude', tokenUsage); + } + } + }, + }); + + if (result.timedOut) { + throw new Error( + `Claude CLI timed out${formatTimeoutSuffix(this.config.timeoutMs ?? undefined)}`, + ); + } + + if (result.exitCode !== 0) { + const detail = result.stderr.trim() || result.stdout.trim(); + const prefix = `Claude CLI exited with code ${result.exitCode}`; + throw new Error(detail ? `${prefix}: ${detail}` : prefix); + } + + const endTime = new Date().toISOString(); + // Prefer the duration reported by the result event; otherwise use wall-clock time since invoke started. + const totalDurationMs = durationMs ?? 
Date.now() - startMs; + + return { + raw: { + model: this.config.model, + logFile: logger?.filePath, + args, + exitCode: result.exitCode, + }, + output, + tokenUsage, + costUsd, + durationMs: totalDurationMs, + startTime, + endTime, + }; + } finally { + await logger?.close(); + } + } + /** + * Builds the CLI arguments: print mode with stream-json output, plus --model and --max-turns when configured. + */ + private buildArgs(): string[] { + // --verbose is required when combining -p with --output-format stream-json + const args = [ + '-p', + '--output-format', + 'stream-json', + '--include-partial-messages', + '--verbose', + ]; + + if (this.config.model) { + args.push('--model', this.config.model); + } + + if (this.config.maxTurns !== undefined) { + args.push('--max-turns', String(this.config.maxTurns)); + } + + return args; + } + + /** Resolves the working directory: the per-request override wins over config.cwd; undefined means no cwd is passed to spawn. */ + private resolveCwd(cwdOverride?: string): string | undefined { + if (cwdOverride) { + return path.resolve(cwdOverride); + } + if (this.config.cwd) { + return path.resolve(this.config.cwd); + } + return undefined; + } + + /** Returns the log directory, or undefined when AGENTV_CLAUDE_STREAM_LOGS disables logging; defaults to .agentv/logs/claude-cli under process.cwd(). */ + private resolveLogDirectory(): string | undefined { + const disabled = isClaudeCliLogStreamingDisabled(); + if (disabled) { + return undefined; + } + if (this.config.logDir) { + return path.resolve(this.config.logDir); + } + return path.join(process.cwd(), '.agentv', 'logs', 'claude-cli'); + } + + /** Best-effort logger setup: a failure to create the directory or the logger is warned about and logging is skipped (returns undefined). */ + private async createStreamLogger( + request: ProviderRequest, + ): Promise { + const logDir = this.resolveLogDirectory(); + if (!logDir) { + return undefined; + } + try { + await mkdir(logDir, { recursive: true }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`Skipping Claude CLI stream logging (could not create ${logDir}): ${message}`); + return undefined; + } + + const filePath = path.join(logDir, buildLogFilename(request, this.targetName)); + + try { + const logger = await ClaudeCliStreamLogger.create({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + format: this.config.logFormat ?? 
'summary', + }); + recordClaudeLogEntry({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + }); + return logger; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`Skipping Claude CLI stream logging for ${filePath}: ${message}`); + return undefined; + } + } + /** + * Spawns claude, writes the prompt to stdin and forwards every complete, non-empty stdout line (trimmed) to onLine. + * Resolves on close with the captured stdout and stderr, the exit code (-1 when the close code is not a number) and a timedOut flag. + * Rejects only on a spawn error; a missing executable (ENOENT) gets a dedicated message. + * NOTE(review): abort and timeout both send SIGTERM only, and an abort is not flagged in the result, so invoke presumably reports it as a non-zero exit - confirm. + */ + private async runClaude(options: { + readonly args: string[]; + readonly cwd: string | undefined; + readonly prompt: string; + readonly env: Record; + readonly signal?: AbortSignal; + readonly onLine: (line: string) => void; + }): Promise<{ stdout: string; stderr: string; exitCode: number; timedOut: boolean }> { + return new Promise((resolve, reject) => { + const spawnOptions: Parameters[2] = { + stdio: ['pipe', 'pipe', 'pipe'], + env: options.env as NodeJS.ProcessEnv, + }; + if (options.cwd) { + spawnOptions.cwd = options.cwd; + } + + const child = spawn('claude', options.args, spawnOptions); + + let stdout = ''; + let stderr = ''; + let timedOut = false; + let stdoutBuffer = ''; + + const onAbort = (): void => { + child.kill('SIGTERM'); + }; + + if (options.signal) { + if (options.signal.aborted) { + onAbort(); + } else { + options.signal.addEventListener('abort', onAbort, { once: true }); + } + } + + let timeoutHandle: NodeJS.Timeout | undefined; + if (this.config.timeoutMs && this.config.timeoutMs > 0) { + timeoutHandle = setTimeout(() => { + timedOut = true; + child.kill('SIGTERM'); + }, this.config.timeoutMs); + timeoutHandle.unref?.(); + } + + if (child.stdout) { + child.stdout.setEncoding('utf8'); + child.stdout.on('data', (chunk: string) => { + stdout += chunk; + stdoutBuffer += chunk; + // Process complete lines + const lines = stdoutBuffer.split(/\r?\n/); + stdoutBuffer = lines.pop() ?? 
''; + for (const line of lines) { + const trimmed = line.trim(); + if (trimmed.length > 0) { + options.onLine(trimmed); + } + } + }); + } + + if (child.stderr) { + child.stderr.setEncoding('utf8'); + child.stderr.on('data', (chunk: string) => { + stderr += chunk; + }); + } + + // Send prompt via stdin + child.stdin?.end(options.prompt); + + const cleanup = (): void => { + if (timeoutHandle) { + clearTimeout(timeoutHandle); + } + if (options.signal) { + options.signal.removeEventListener('abort', onAbort); + } + }; + + child.on('error', (error) => { + cleanup(); + const err = error as NodeJS.ErrnoException; + if (err.code === 'ENOENT') { + reject( + new Error( + `Claude CLI executable 'claude' was not found on PATH. Install claude-code or ensure it is in PATH.`, + ), + ); + } else { + reject(error); + } + }); + + child.on('close', (code) => { + cleanup(); + // Flush remaining buffer + if (stdoutBuffer.trim().length > 0) { + options.onLine(stdoutBuffer.trim()); + } + resolve({ + stdout, + stderr, + // -1 stands in for a close code that is not a number. + exitCode: typeof code === 'number' ? code : -1, + timedOut, + }); + }); + }); + } +} + +/** + * Append-mode file logger for the CLI stream; writes either JSON records or one-line summaries depending on format. + */ +class ClaudeCliStreamLogger { + readonly filePath: string; + private readonly stream: WriteStream; + private readonly startedAt = Date.now(); + private readonly format: 'summary' | 'json'; + + private constructor(filePath: string, format: 'summary' | 'json') { + this.filePath = filePath; + this.format = format; + this.stream = createWriteStream(filePath, { flags: 'a' }); + } + + static async create(options: { + readonly filePath: string; + readonly targetName: string; + readonly evalCaseId?: string; + readonly attempt?: number; + readonly format: 'summary' | 'json'; + }): Promise { + const logger = new ClaudeCliStreamLogger(options.filePath, options.format); + const header = [ + '# Claude CLI stream log', + `# target: ${options.targetName}`, + options.evalCaseId ? `# eval: ${options.evalCaseId}` : undefined, + options.attempt !== undefined ? 
`# attempt: ${options.attempt + 1}` : undefined, + `# started: ${new Date().toISOString()}`, + '', + ].filter((line): line is string => Boolean(line)); /* NOTE(review): Boolean('') is false, so the final empty entry is dropped too and no blank separator line is written. */ + for (const line of header) { + logger.stream.write(`${line}\n`); + } + return logger; + } + /** + * Writes one stdout line to the log. In json format every line becomes a JSON record (the parsed event under data, or the raw text under raw). + * In summary format a parsed event is written only when summarizeEvent returns a summary; a line that is not a JSON object is written verbatim. + */ + handleLine(line: string): void { + const elapsed = formatElapsed(this.startedAt); + const event = tryParseJson(line); + + if (this.format === 'json') { + if (event) { + this.stream.write(`${JSON.stringify({ time: elapsed, data: event })}\n`); + } else { + this.stream.write(`${JSON.stringify({ time: elapsed, raw: line })}\n`); + } + } else { + if (event) { + const summary = summarizeEvent(event); + if (summary) { + const type = typeof event.type === 'string' ? event.type : 'unknown'; + this.stream.write(`[+${elapsed}] [${type}] ${summary}\n`); + } + } else { + this.stream.write(`[+${elapsed}] ${line}\n`); + } + } + } + /** + * Ends the write stream; resolves from the end callback and rejects if the stream emits an error first. + */ + async close(): Promise { + await new Promise((resolve, reject) => { + this.stream.once('error', reject); + this.stream.end(() => resolve()); + }); + } +} +/** + * Returns a short summary for assistant, user, result and system events, and undefined for any other type. + * Only the first content part of an assistant or user message is inspected. + */ +function summarizeEvent(event: Record): string | undefined { + const type = event.type as string; + switch (type) { + case 'assistant': { + const message = event.message as Record | undefined; + if (message) { + const content = message.content; + if (Array.isArray(content) && content.length > 0) { + const first = content[0] as Record | undefined; + if (first?.type === 'tool_use') { + return `tool_use (${first.name})`; + } + if (first?.type === 'text') { + const text = first.text; + if (typeof text === 'string') { + const preview = text.length > 50 ? 
`${text.slice(0, 50)}...` : text; + return preview; + } + } + } + } + return 'message'; + } + case 'user': { + const message = event.message as Record | undefined; + if (message) { + const content = message.content; + if (Array.isArray(content) && content.length > 0) { + const first = content[0] as Record | undefined; + if (first?.type === 'tool_result') { + return `tool_result (${first.tool_use_id})`; + } + } + } + return 'user'; + } + case 'result': { + const cost = event.total_cost_usd; + const duration = event.duration_ms; + if (typeof cost === 'number' && typeof duration === 'number') { + return `$${cost.toFixed(4)}, ${Math.round(duration)}ms`; + } + return 'result'; + } + case 'system': + return 'init'; + default: + return undefined; + } +} + +/** + * Extract text content from Claude's content array format. + * A string is returned as-is; for an array, the text parts are joined with newlines. + * Returns undefined for any other input, or when the array has no text parts. + */ +function extractTextContent(content: unknown): string | undefined { + if (typeof content === 'string') { + return content; + } + if (!Array.isArray(content)) { + return undefined; + } + const textParts: string[] = []; + for (const part of content) { + if (!part || typeof part !== 'object') { + continue; + } + const p = part as Record; + if (p.type === 'text' && typeof p.text === 'string') { + textParts.push(p.text); + } + } + return textParts.length > 0 ? textParts.join('\n') : undefined; +} + +/** + * Extract tool calls from Claude's content array format. + * Keeps only tool_use parts whose name is a string; output and durationMs are not populated here. + */ +function extractToolCalls(content: unknown): readonly ToolCall[] { + if (!Array.isArray(content)) { + return []; + } + const toolCalls: ToolCall[] = []; + for (const part of content) { + if (!part || typeof part !== 'object') { + continue; + } + const p = part as Record; + if (p.type === 'tool_use' && typeof p.name === 'string') { + toolCalls.push({ + tool: p.name, + input: p.input, + id: typeof p.id === 'string' ? p.id : undefined, + }); + } + } + return toolCalls; +} + +/** + * Build a sanitized process.env without variables that block nested Claude sessions. 
+ * Removes CLAUDECODE so the spawned CLI doesn't refuse to run inside another session. + * CLAUDE_CODE_ENTRYPOINT is cleared as well, and CC_PARENT_SPAN_ID / CC_ROOT_SPAN_ID are set when span ids are supplied. + */ +function sanitizeEnvForClaude(braintrustSpanIds?: { + readonly parentSpanId: string; + readonly rootSpanId: string; +}): Record { + const env = { ...process.env }; + // Remove all Claude Code session markers to allow nested sessions + // NOTE(review): the keys are set to undefined rather than deleted - this relies on spawn skipping undefined-valued env entries; confirm for the runtime in use. + env.CLAUDECODE = undefined; + env.CLAUDE_CODE_ENTRYPOINT = undefined; + // Inject Braintrust trace IDs so the trace-claude-code plugin can attach + // Claude Code session traces to the AgentV eval span + if (braintrustSpanIds) { + env.CC_PARENT_SPAN_ID = braintrustSpanIds.parentSpanId; + env.CC_ROOT_SPAN_ID = braintrustSpanIds.rootSpanId; + } + return env; +} + +/** True when AGENTV_CLAUDE_STREAM_LOGS is set to false, 0 or off (trimmed, case-insensitive). */ +function isClaudeCliLogStreamingDisabled(): boolean { + const envValue = process.env.AGENTV_CLAUDE_STREAM_LOGS; + if (!envValue) { + return false; + } + const normalized = envValue.trim().toLowerCase(); + return normalized === 'false' || normalized === '0' || normalized === 'off'; +} + +/** Builds a unique log file name from a timestamp, the sanitized target and eval id, an optional 1-based attempt suffix and a short random id. */ +function buildLogFilename(request: ProviderRequest, targetName: string): string { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const evalId = sanitizeForFilename(request.evalCaseId ?? 'claude-cli'); + const attemptSuffix = request.attempt !== undefined ? `_attempt-${request.attempt + 1}` : ''; + const target = sanitizeForFilename(targetName); + return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`; +} + +/** Replaces each run of characters outside letters, digits, dot, underscore and hyphen with one underscore; falls back to claude-cli for an empty result. */ +function sanitizeForFilename(value: string): string { + const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, '_'); + return sanitized.length > 0 ? 
sanitized : 'claude-cli'; +} +/** + * Formats the time elapsed since startedAt as MM:SS, or as HH:MM:SS once at least one hour has passed. + */ +function formatElapsed(startedAt: number): string { + const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1000); + const hours = Math.floor(elapsedSeconds / 3600); + const minutes = Math.floor((elapsedSeconds % 3600) / 60); + const seconds = elapsedSeconds % 60; + if (hours > 0) { + return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; + } + return `${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; +} +/** + * Returns a message suffix giving the timeout in whole seconds (rounded up), or an empty string when the timeout is missing or not positive. + */ +function formatTimeoutSuffix(timeoutMs: number | undefined): string { + if (!timeoutMs || timeoutMs <= 0) { + return ''; + } + const seconds = Math.ceil(timeoutMs / 1000); + return ` after ${seconds}s`; +} +/** + * Parses a line as JSON and returns it only when it is a non-array object; returns undefined otherwise, including for invalid JSON. + */ +function tryParseJson(line: string): Record | undefined { + try { + const parsed = JSON.parse(line); + if (parsed && typeof parsed === 'object' && !Array.isArray(parsed)) { + return parsed as Record; + } + return undefined; + } catch { + return undefined; + } +} diff --git a/packages/core/src/evaluation/providers/claude-sdk.ts b/packages/core/src/evaluation/providers/claude-sdk.ts new file mode 100644 index 000000000..d4a768b4e --- /dev/null +++ b/packages/core/src/evaluation/providers/claude-sdk.ts @@ -0,0 +1,495 @@ +import { randomUUID } from 'node:crypto'; +import { createWriteStream } from 'node:fs'; +import type { WriteStream } from 'node:fs'; +import { mkdir } from 'node:fs/promises'; +import path from 'node:path'; + +import { recordClaudeLogEntry } from './claude-log-tracker.js'; +import { buildPromptDocument, normalizeInputFiles } from './preread.js'; +import type { ClaudeResolvedConfig } from './targets.js'; +import type { + Message, + Provider, + ProviderRequest, + ProviderResponse, + ProviderTokenUsage, + ToolCall, +} from './types.js'; + +// Lazy-loaded module to avoid bundling issues with dynamic requires +// biome-ignore lint/suspicious/noExplicitAny: dynamic import type +let claudeSdkModule: any = null; + 
+/** Lazily imports the Claude Agent SDK once and caches the module; throws an error with an install hint when the import fails. */ async function loadClaudeSdk(): Promise { + if (!claudeSdkModule) { + try { + claudeSdkModule = await import('@anthropic-ai/claude-agent-sdk'); + } catch (error) { + throw new Error( + `Failed to load @anthropic-ai/claude-agent-sdk. Please install it:\n npm install @anthropic-ai/claude-agent-sdk\n\nOriginal error: ${error instanceof Error ? error.message : String(error)}`, + ); + } + } + return claudeSdkModule; +} + +/** + * Claude Agent SDK provider using the @anthropic-ai/claude-agent-sdk library directly. + * This provides typed SDK access for structured tool calls, token usage, and clean + * session lifecycle. Use `claude-cli` for subprocess-based invocation. + * + * Note: The SDK is loaded lazily on first use to avoid bundling issues. + * Users must install @anthropic-ai/claude-agent-sdk separately. + */ +export class ClaudeSdkProvider implements Provider { + readonly id: string; + readonly kind = 'claude-sdk' as const; + readonly targetName: string; + readonly supportsBatch = false; + + private readonly config: ClaudeResolvedConfig; + + constructor(targetName: string, config: ClaudeResolvedConfig) { + this.id = `claude-sdk:${targetName}`; + this.targetName = targetName; + this.config = config; + } + /** + * Runs one request through sdk.query and iterates the returned message stream. + * Assistant messages become output messages (text plus tool calls); the result message supplies cost, duration and token usage. + * Throws when the request is already aborted or when the SDK cannot be loaded. + * The stream logger, if one was created, is closed in the finally block on every path. + */ + async invoke(request: ProviderRequest): Promise { + if (request.signal?.aborted) { + throw new Error('Claude SDK request was aborted before execution'); + } + + const sdk = await loadClaudeSdk(); + + const startTime = new Date().toISOString(); + const startMs = Date.now(); + + const logger = await this.createStreamLogger(request).catch(() => undefined); + + // Build the prompt + const inputFiles = normalizeInputFiles(request.inputFiles); + const prompt = buildPromptDocument(request, inputFiles); + + // Skip forced diff prompt when AgentV captures file changes + // NOTE(review): nothing is skipped here - the configured system prompt is read unchanged; the comment above looks stale. + const systemPrompt = this.config.systemPrompt; + + // Build query options + // biome-ignore lint/suspicious/noExplicitAny: SDK options type is dynamically loaded + const queryOptions: any = { + 
permissionMode: 'bypassPermissions', + allowDangerouslySkipPermissions: true, + // The SDK spawns a Claude Code subprocess. When AgentV itself runs inside + // a Claude Code session the CLAUDECODE env var is set, which causes the + // subprocess to refuse to start ("cannot be launched inside another Claude + // Code session"). Passing a sanitized env removes that guard. + env: sanitizeEnvForClaudeSdk(request.braintrustSpanIds), + }; + + if (this.config.model) { + queryOptions.model = this.config.model; + } + + const cwd = this.resolveCwd(request.cwd); + if (cwd) { + queryOptions.cwd = cwd; + } + + if (systemPrompt) { + queryOptions.systemPrompt = systemPrompt; + } + + if (this.config.maxTurns !== undefined) { + queryOptions.maxTurns = this.config.maxTurns; + } + + if (this.config.maxBudgetUsd !== undefined) { + queryOptions.maxBudgetUsd = this.config.maxBudgetUsd; + } + + // NOTE(review): the object below is a plain { signal } cast to AbortController and has no abort() method - confirm the SDK only reads .signal, or pass a real AbortController. + if (request.signal) { + queryOptions.abortController = { signal: request.signal } as AbortController; + } + + // Track state from messages + // NOTE(review): completedToolCalls is appended to but never read in this method - confirm it is still needed. + const completedToolCalls: ToolCall[] = []; + const output: Message[] = []; + let tokenUsage: ProviderTokenUsage | undefined; + let costUsd: number | undefined; + let durationMs: number | undefined; + + try { + const q = sdk.query({ prompt, options: queryOptions }); + + // Set up timeout if configured + // NOTE(review): on timeout the generator is asked to return, which appears to end the loop below normally - no timeout error is thrown, so the output collected so far would be returned as a success; confirm. + let timeoutTimer: ReturnType | undefined; + if (this.config.timeoutMs) { + timeoutTimer = setTimeout(() => { + q.return(undefined as never).catch(() => {}); + }, this.config.timeoutMs); + timeoutTimer.unref?.(); + } + + try { + for await (const message of q) { + logger?.handleMessage(message); + + if (message.type === 'assistant') { + const betaMessage = (message as { message?: unknown }).message; + if (betaMessage && typeof betaMessage === 'object') { + const msg = betaMessage as Record; + const content = msg.content; + const textContent = extractTextContent(content); + const toolCalls = extractToolCalls(content); + + const outputMsg: Message = { + role: 
'assistant', + content: textContent, + toolCalls: toolCalls.length > 0 ? toolCalls : undefined, + }; + output.push(outputMsg); + completedToolCalls.push(...toolCalls); + + // Stream callbacks for real-time observability + // extractToolCalls sets only tool, input and id, so tc.output is undefined and the duration falls back to 0 here. + if (request.streamCallbacks) { + for (const tc of toolCalls) { + request.streamCallbacks.onToolCallEnd?.( + tc.tool, + tc.input, + tc.output, + tc.durationMs ?? 0, + tc.id, + ); + } + } + } + } + + if (message.type === 'result') { + const result = message as Record; + if (typeof result.total_cost_usd === 'number') { + costUsd = result.total_cost_usd; + } + if (typeof result.duration_ms === 'number') { + durationMs = result.duration_ms; + } + const usage = result.usage as Record | undefined; + if (usage) { + // Reported input total sums plain, cache-read and cache-creation input tokens; cached carries the cache-read count only. + const inputTokens = + ((usage.input_tokens as number) ?? 0) + + ((usage.cache_read_input_tokens as number) ?? 0) + + ((usage.cache_creation_input_tokens as number) ?? 0); + const outputTokens = (usage.output_tokens as number) ?? 0; + tokenUsage = { + input: inputTokens, + output: outputTokens, + cached: (usage.cache_read_input_tokens as number) ?? undefined, + }; + + // Stream callback for LLM usage + request.streamCallbacks?.onLlmCallEnd?.(this.config.model ?? 'claude', tokenUsage); + } + } + } + } finally { + if (timeoutTimer) { + clearTimeout(timeoutTimer); + } + } + + const endTime = new Date().toISOString(); + // Prefer the duration reported by the result message; otherwise use wall-clock time since invoke started. + const totalDurationMs = durationMs ?? 
Date.now() - startMs; + + return { + raw: { + model: this.config.model, + logFile: logger?.filePath, + }, + output, + tokenUsage, + costUsd, + durationMs: totalDurationMs, + startTime, + endTime, + }; + } finally { + await logger?.close(); + } + } + /** + * Resolves the working directory: the per-request override wins over config.cwd; undefined leaves queryOptions.cwd unset. + */ + private resolveCwd(cwdOverride?: string): string | undefined { + if (cwdOverride) { + return path.resolve(cwdOverride); + } + if (this.config.cwd) { + return path.resolve(this.config.cwd); + } + return undefined; + } + /** + * Returns the log directory, or undefined when AGENTV_CLAUDE_STREAM_LOGS disables logging; defaults to .agentv/logs/claude under process.cwd(). + */ + private resolveLogDirectory(): string | undefined { + const disabled = isClaudeLogStreamingDisabled(); + if (disabled) { + return undefined; + } + if (this.config.logDir) { + return path.resolve(this.config.logDir); + } + return path.join(process.cwd(), '.agentv', 'logs', 'claude'); + } + /** + * Best-effort logger setup: a failure to create the directory or the logger is warned about and logging is skipped (returns undefined). + */ + private async createStreamLogger( + request: ProviderRequest, + ): Promise { + const logDir = this.resolveLogDirectory(); + if (!logDir) { + return undefined; + } + try { + await mkdir(logDir, { recursive: true }); + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`Skipping Claude stream logging (could not create ${logDir}): ${message}`); + return undefined; + } + + const filePath = path.join(logDir, buildLogFilename(request, this.targetName)); + + try { + const logger = await ClaudeStreamLogger.create({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + format: this.config.logFormat ?? 'summary', + }); + recordClaudeLogEntry({ + filePath, + targetName: this.targetName, + evalCaseId: request.evalCaseId, + attempt: request.attempt, + }); + return logger; + } catch (error) { + const message = error instanceof Error ? error.message : String(error); + console.warn(`Skipping Claude stream logging for ${filePath}: ${message}`); + return undefined; + } + } +} + +/** + * Extract text content from Claude's content array format. + * Claude uses: content: [{ type: "text", text: "..." }, ...] + * A string is returned as-is; for an array, the text parts are joined with newlines. + * Returns undefined for any other input, or when the array has no text parts. 
+ */ +function extractTextContent(content: unknown): string | undefined { + if (typeof content === 'string') { + return content; + } + if (!Array.isArray(content)) { + return undefined; + } + const textParts: string[] = []; + for (const part of content) { + if (!part || typeof part !== 'object') { + continue; + } + const p = part as Record; + if (p.type === 'text' && typeof p.text === 'string') { + textParts.push(p.text); + } + } + return textParts.length > 0 ? textParts.join('\n') : undefined; +} + +/** + * Extract tool calls from Claude's content array format. + * Claude uses: content: [{ type: "tool_use", name: "...", input: {...}, id: "..." }, ...] + * Keeps only tool_use parts whose name is a string; output and durationMs are not populated here. + */ +function extractToolCalls(content: unknown): readonly ToolCall[] { + if (!Array.isArray(content)) { + return []; + } + const toolCalls: ToolCall[] = []; + for (const part of content) { + if (!part || typeof part !== 'object') { + continue; + } + const p = part as Record; + if (p.type === 'tool_use' && typeof p.name === 'string') { + toolCalls.push({ + tool: p.name, + input: p.input, + id: typeof p.id === 'string' ? p.id : undefined, + }); + } + } + return toolCalls; +} +/** + * Append-mode file logger for SDK messages; writes either JSON records or one-line summaries depending on format. + */ +class ClaudeStreamLogger { + readonly filePath: string; + private readonly stream: WriteStream; + private readonly startedAt = Date.now(); + private readonly format: 'summary' | 'json'; + + private constructor(filePath: string, format: 'summary' | 'json') { + this.filePath = filePath; + this.format = format; + this.stream = createWriteStream(filePath, { flags: 'a' }); + } + + static async create(options: { + readonly filePath: string; + readonly targetName: string; + readonly evalCaseId?: string; + readonly attempt?: number; + readonly format: 'summary' | 'json'; + }): Promise { + const logger = new ClaudeStreamLogger(options.filePath, options.format); + const header = [ + '# Claude Agent SDK stream log', + `# target: ${options.targetName}`, + options.evalCaseId ? `# eval: ${options.evalCaseId}` : undefined, + options.attempt !== undefined ? 
`# attempt: ${options.attempt + 1}` : undefined, + `# started: ${new Date().toISOString()}`, + '', + ].filter((line): line is string => Boolean(line)); /* NOTE(review): Boolean('') is false, so the final empty entry is dropped too and no blank separator line is written. */ + for (const line of header) { + logger.stream.write(`${line}\n`); + } + return logger; + } + /** + * Writes one SDK message to the log; anything that is not an object is ignored. + * In json format the whole message is written as a JSON record; in summary format a line is written only when summarizeMessage returns a summary. + */ + handleMessage(message: unknown): void { + if (!message || typeof message !== 'object') { + return; + } + const elapsed = formatElapsed(this.startedAt); + const msg = message as Record; + const type = typeof msg.type === 'string' ? msg.type : 'unknown'; + + if (this.format === 'json') { + this.stream.write(`${JSON.stringify({ time: elapsed, type, data: message })}\n`); + } else { + const summary = summarizeMessage(msg); + if (summary) { + this.stream.write(`[+${elapsed}] [${type}] ${summary}\n`); + } + } + } + /** + * Ends the write stream; resolves from the end callback and rejects if the stream emits an error first. + */ + async close(): Promise { + await new Promise((resolve, reject) => { + this.stream.once('error', reject); + this.stream.end(() => resolve()); + }); + } +} +/** + * Returns a short summary for assistant, user, result and system messages, and undefined for any other type. + * Only the first content part of an assistant or user message is inspected. + */ +function summarizeMessage(msg: Record): string | undefined { + const type = msg.type as string; + switch (type) { + case 'assistant': { + const message = msg.message as Record | undefined; + if (message) { + const content = message.content; + if (Array.isArray(content) && content.length > 0) { + const first = content[0] as Record | undefined; + if (first?.type === 'tool_use') { + return `tool_use (${first.name})`; + } + if (first?.type === 'text') { + const text = first.text; + if (typeof text === 'string') { + const preview = text.length > 50 ? 
`${text.slice(0, 50)}...` : text; + return preview; + } + } + } + } + return 'message'; + } + case 'user': { + const message = msg.message as Record | undefined; + if (message) { + const content = message.content; + if (Array.isArray(content) && content.length > 0) { + const first = content[0] as Record | undefined; + if (first?.type === 'tool_result') { + return `tool_result (${first.tool_use_id})`; + } + } + } + return 'user'; + } + case 'result': { + const cost = msg.total_cost_usd; + const duration = msg.duration_ms; + if (typeof cost === 'number' && typeof duration === 'number') { + return `$${cost.toFixed(4)}, ${Math.round(duration)}ms`; + } + return 'result'; + } + case 'system': + return 'init'; + default: + return undefined; + } +} + +/** + * Build a process.env copy without variables that block nested Claude sessions. + * The Claude Agent SDK spawns Claude Code as a child process; if CLAUDECODE is + * present the child immediately exits with "cannot be launched inside another + * Claude Code session". + * CLAUDE_CODE_ENTRYPOINT is cleared as well, and CC_PARENT_SPAN_ID / CC_ROOT_SPAN_ID are set when span ids are supplied. 
+ */ +function sanitizeEnvForClaudeSdk(braintrustSpanIds?: { + readonly parentSpanId: string; + readonly rootSpanId: string; +}): Record { + const env = { ...process.env }; + // Remove all Claude Code session markers to allow nested sessions + // NOTE(review): the keys are set to undefined rather than deleted - this relies on the SDK and its child-process spawn ignoring undefined-valued env entries; confirm. + env.CLAUDECODE = undefined; + env.CLAUDE_CODE_ENTRYPOINT = undefined; + // Inject Braintrust trace IDs so the trace-claude-code plugin can attach + // Claude Code session traces to the AgentV eval span + if (braintrustSpanIds) { + env.CC_PARENT_SPAN_ID = braintrustSpanIds.parentSpanId; + env.CC_ROOT_SPAN_ID = braintrustSpanIds.rootSpanId; + } + return env; +} + +/** True when AGENTV_CLAUDE_STREAM_LOGS is set to false, 0 or off (trimmed, case-insensitive). */ +function isClaudeLogStreamingDisabled(): boolean { + const envValue = process.env.AGENTV_CLAUDE_STREAM_LOGS; + if (!envValue) { + return false; + } + const normalized = envValue.trim().toLowerCase(); + return normalized === 'false' || normalized === '0' || normalized === 'off'; +} + +/** Builds a unique log file name from a timestamp, the sanitized target and eval id, an optional 1-based attempt suffix and a short random id. */ +function buildLogFilename(request: ProviderRequest, targetName: string): string { + const timestamp = new Date().toISOString().replace(/[:.]/g, '-'); + const evalId = sanitizeForFilename(request.evalCaseId ?? 'claude'); + const attemptSuffix = request.attempt !== undefined ? `_attempt-${request.attempt + 1}` : ''; + const target = sanitizeForFilename(targetName); + return `${timestamp}_${target}_${evalId}${attemptSuffix}_${randomUUID().slice(0, 8)}.log`; +} + +function sanitizeForFilename(value: string): string { + const sanitized = value.replace(/[^A-Za-z0-9._-]+/g, '_'); + return sanitized.length > 0 ? 
sanitized : 'claude'; +} + +function formatElapsed(startedAt: number): string { + const elapsedSeconds = Math.floor((Date.now() - startedAt) / 1000); + const hours = Math.floor(elapsedSeconds / 3600); + const minutes = Math.floor((elapsedSeconds % 3600) / 60); + const seconds = elapsedSeconds % 60; + if (hours > 0) { + return `${hours.toString().padStart(2, '0')}:${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; + } + return `${minutes.toString().padStart(2, '0')}:${seconds.toString().padStart(2, '0')}`; +} diff --git a/packages/core/src/evaluation/providers/index.ts b/packages/core/src/evaluation/providers/index.ts index 547183776..62cd8eef8 100644 --- a/packages/core/src/evaluation/providers/index.ts +++ b/packages/core/src/evaluation/providers/index.ts @@ -1,4 +1,6 @@ import { AnthropicProvider, AzureProvider, GeminiProvider } from './ai-sdk.js'; +import { ClaudeCliProvider } from './claude-cli.js'; +import { ClaudeSdkProvider } from './claude-sdk.js'; import { ClaudeProvider } from './claude.js'; import { CliProvider } from './cli.js'; import { CodexProvider } from './codex.js'; @@ -87,7 +89,11 @@ export function createBuiltinProviderRegistry(): ProviderRegistry { .register('copilot-cli', (t) => new CopilotCliProvider(t.name, t.config as never)) .register('pi-coding-agent', (t) => new PiCodingAgentProvider(t.name, t.config as never)) .register('pi-agent-sdk', (t) => new PiAgentSdkProvider(t.name, t.config as never)) - .register('claude', (t) => new ClaudeProvider(t.name, t.config as never)) + // claude-cli is the new default subprocess provider; claude is an alias + .register('claude-cli', (t) => new ClaudeCliProvider(t.name, t.config as never)) + .register('claude', (t) => new ClaudeCliProvider(t.name, t.config as never)) + // claude-sdk is the explicit SDK provider (requires @anthropic-ai/claude-agent-sdk) + .register('claude-sdk', (t) => new ClaudeSdkProvider(t.name, t.config as never)) .register('mock', (t) => new 
MockProvider(t.name, t.config as never)) .register('vscode', (t) => new VSCodeProvider(t.name, t.config as never, 'vscode')) .register( diff --git a/packages/core/src/evaluation/providers/targets.ts b/packages/core/src/evaluation/providers/targets.ts index 1fb331d6c..aa30b06b6 100644 --- a/packages/core/src/evaluation/providers/targets.ts +++ b/packages/core/src/evaluation/providers/targets.ts @@ -596,6 +596,22 @@ export type ResolvedTarget = readonly providerBatching?: boolean; readonly config: ClaudeResolvedConfig; } + | { + readonly kind: 'claude-cli'; + readonly name: string; + readonly judgeTarget?: string; + readonly workers?: number; + readonly providerBatching?: boolean; + readonly config: ClaudeResolvedConfig; + } + | { + readonly kind: 'claude-sdk'; + readonly name: string; + readonly judgeTarget?: string; + readonly workers?: number; + readonly providerBatching?: boolean; + readonly config: ClaudeResolvedConfig; + } | { readonly kind: 'mock'; readonly name: string; @@ -788,9 +804,18 @@ export function resolveTargetDefinition( }; case 'claude': case 'claude-code': + case 'claude-cli': + return { + kind: 'claude-cli', + name: parsed.name, + judgeTarget: parsed.judge_target, + workers: parsed.workers, + providerBatching, + config: resolveClaudeConfig(parsed, env, evalFilePath), + }; case 'claude-sdk': return { - kind: 'claude', + kind: 'claude-sdk', name: parsed.name, judgeTarget: parsed.judge_target, workers: parsed.workers, diff --git a/packages/core/src/evaluation/providers/types.ts b/packages/core/src/evaluation/providers/types.ts index a30108d5b..af5e3b6a1 100644 --- a/packages/core/src/evaluation/providers/types.ts +++ b/packages/core/src/evaluation/providers/types.ts @@ -20,6 +20,8 @@ export type ProviderKind = | 'pi-coding-agent' | 'pi-agent-sdk' | 'claude' + | 'claude-cli' + | 'claude-sdk' | 'cli' | 'mock' | 'vscode' @@ -35,6 +37,8 @@ export const AGENT_PROVIDER_KINDS: readonly ProviderKind[] = [ 'copilot-cli', 'pi-coding-agent', 'claude', + 
'claude-cli', + 'claude-sdk', 'vscode', 'vscode-insiders', ] as const; @@ -53,6 +57,8 @@ export const KNOWN_PROVIDERS: readonly ProviderKind[] = [ 'pi-coding-agent', 'pi-agent-sdk', 'claude', + 'claude-cli', + 'claude-sdk', 'cli', 'mock', 'vscode', @@ -73,7 +79,6 @@ export const PROVIDER_ALIASES: readonly string[] = [ 'pi', // alias for "pi-coding-agent" 'claude-code', // alias for "claude" (legacy) - 'claude-sdk', // alias for "claude" 'openai', // legacy/future support 'bedrock', // legacy/future support 'vertex', // legacy/future support diff --git a/packages/core/src/evaluation/registry/index.ts b/packages/core/src/evaluation/registry/index.ts index fc60bb177..75c8332dc 100644 --- a/packages/core/src/evaluation/registry/index.ts +++ b/packages/core/src/evaluation/registry/index.ts @@ -7,3 +7,4 @@ export { EvaluatorRegistry, DeterministicAssertionEvaluator } from './evaluator- export type { EvaluatorDispatchContext, EvaluatorFactoryFn } from './evaluator-registry.js'; export { createBuiltinRegistry } from './builtin-evaluators.js'; export { discoverAssertions } from './assertion-discovery.js'; +export { discoverJudges } from './judge-discovery.js'; diff --git a/packages/core/src/evaluation/registry/judge-discovery.ts b/packages/core/src/evaluation/registry/judge-discovery.ts new file mode 100644 index 000000000..c4a843565 --- /dev/null +++ b/packages/core/src/evaluation/registry/judge-discovery.ts @@ -0,0 +1,78 @@ +/** + * Convention-based discovery of custom judge scripts. + * + * Scans `.agentv/judges/` for TypeScript/JavaScript files and registers + * them as code-judge evaluators in the registry. The file name (without + * extension) becomes the evaluator type name. 
+ * + * Example: `.agentv/judges/trigger-judge.ts` → type "trigger-judge" in EVAL.yaml + */ + +import path from 'node:path'; +import fg from 'fast-glob'; + +import { CodeEvaluator } from '../evaluators/code-evaluator.js'; +import type { EvaluatorFactoryFn } from './evaluator-registry.js'; +import type { EvaluatorRegistry } from './evaluator-registry.js'; + +/** + * Discover custom judge scripts from `.agentv/judges/` and register + * them as evaluator types in the registry. + * + * @param registry - The evaluator registry to register discovered judges into + * @param baseDir - The base directory to search from (typically project root or eval file dir) + * @returns Names of discovered judge types + */ +export async function discoverJudges( + registry: EvaluatorRegistry, + baseDir: string, +): Promise { + const patterns = ['*.ts', '*.js', '*.mts', '*.mjs']; + + // Search baseDir and its ancestors for .agentv/judges/ + const candidateDirs: string[] = []; + let dir = path.resolve(baseDir); + const root = path.parse(dir).root; + while (dir !== root) { + candidateDirs.push(path.join(dir, '.agentv', 'judges')); + dir = path.dirname(dir); + } + + let files: string[] = []; + for (const judgesDir of candidateDirs) { + try { + const found = await fg(patterns, { + cwd: judgesDir, + absolute: true, + onlyFiles: true, + }); + files = files.concat(found); + } catch { + // Directory doesn't exist — skip + } + } + + const discoveredTypes: string[] = []; + + for (const filePath of files) { + const basename = path.basename(filePath); + const typeName = basename.replace(/\.(ts|js|mts|mjs)$/, ''); + + // Don't override built-in types + if (registry.has(typeName)) { + continue; + } + + const factory: EvaluatorFactoryFn = (_config, context) => { + return new CodeEvaluator({ + command: ['bun', 'run', filePath], + agentTimeoutMs: context.agentTimeoutMs, + }); + }; + + registry.register(typeName, factory); + discoveredTypes.push(typeName); + } + + return discoveredTypes; +} diff --git 
a/packages/core/src/evaluation/validation/targets-validator.ts b/packages/core/src/evaluation/validation/targets-validator.ts index c507308e9..068848e00 100644 --- a/packages/core/src/evaluation/validation/targets-validator.ts +++ b/packages/core/src/evaluation/validation/targets-validator.ts @@ -217,6 +217,7 @@ function getKnownSettings(provider: string): Set | null { return COPILOT_CLI_SETTINGS; case 'claude': case 'claude-code': + case 'claude-cli': case 'claude-sdk': return CLAUDE_SETTINGS; case 'vscode': diff --git a/packages/core/src/index.ts b/packages/core/src/index.ts index 7df57f3f2..514f7acae 100644 --- a/packages/core/src/index.ts +++ b/packages/core/src/index.ts @@ -57,6 +57,7 @@ export type { } from './evaluation/registry/evaluator-registry.js'; export { createBuiltinRegistry } from './evaluation/registry/builtin-evaluators.js'; export { discoverAssertions } from './evaluation/registry/assertion-discovery.js'; +export { discoverJudges } from './evaluation/registry/judge-discovery.js'; export type AgentKernel = { status: string; diff --git a/packages/core/test/evaluation/providers/claude-provider-aliases.test.ts b/packages/core/test/evaluation/providers/claude-provider-aliases.test.ts new file mode 100644 index 000000000..4834d02c5 --- /dev/null +++ b/packages/core/test/evaluation/providers/claude-provider-aliases.test.ts @@ -0,0 +1,63 @@ +import { describe, expect, it } from 'bun:test'; + +import { ClaudeCliProvider } from '../../../src/evaluation/providers/claude-cli.js'; +import { ClaudeSdkProvider } from '../../../src/evaluation/providers/claude-sdk.js'; +import { ClaudeProvider } from '../../../src/evaluation/providers/claude.js'; +import { createBuiltinProviderRegistry } from '../../../src/evaluation/providers/index.js'; + +const mockClaudeConfig = { + model: undefined, + cwd: undefined, + timeoutMs: undefined, + logDir: undefined, + logFormat: 'summary' as const, + systemPrompt: undefined, + maxTurns: undefined, + maxBudgetUsd: undefined, +}; + 
+describe('Claude provider alias resolution', () => { + const registry = createBuiltinProviderRegistry(); + + it('creates a ClaudeCliProvider for claude-cli kind', () => { + const provider = registry.create({ + name: 'test-target', + kind: 'claude-cli', + config: mockClaudeConfig, + }); + expect(provider).toBeInstanceOf(ClaudeCliProvider); + expect(provider.kind).toBe('claude-cli'); + expect(provider.id).toBe('claude-cli:test-target'); + }); + + it('creates a ClaudeCliProvider for claude kind (alias for claude-cli)', () => { + const provider = registry.create({ + name: 'test-target', + kind: 'claude', + config: mockClaudeConfig, + }); + expect(provider).toBeInstanceOf(ClaudeCliProvider); + expect(provider.kind).toBe('claude-cli'); + }); + + it('creates a ClaudeSdkProvider for claude-sdk kind', () => { + const provider = registry.create({ + name: 'test-target', + kind: 'claude-sdk', + config: mockClaudeConfig, + }); + expect(provider).toBeInstanceOf(ClaudeSdkProvider); + expect(provider.kind).toBe('claude-sdk'); + expect(provider.id).toBe('claude-sdk:test-target'); + }); + + it('ClaudeCliProvider and ClaudeProvider are different classes', () => { + // ClaudeProvider is the legacy SDK provider kept for reference + const cliProvider = new ClaudeCliProvider('target', mockClaudeConfig); + const sdkProvider = new ClaudeProvider('target', mockClaudeConfig as never); + expect(cliProvider).toBeInstanceOf(ClaudeCliProvider); + expect(sdkProvider).toBeInstanceOf(ClaudeProvider); + expect(cliProvider.kind).toBe('claude-cli'); + expect(sdkProvider.kind).toBe('claude'); + }); +});