# Ricky Evals: runs the provider-backed eval suite (OpenRouter), writes a step
# summary, comments human-review cases on the PR, and uploads the run artifacts.
name: Ricky Evals

on:
  pull_request:
    paths:
      - ".github/workflows/ricky-evals.yml"
      - "evals/**"
      - "scripts/evals/**"
      - "AGENTS.md"
      - "README.md"
      - "SPEC.md"
      - "docs/**"
      - "specs/**"
      - "src/cloud/**"
      - "src/local/**"
      - "src/product/**"
      - "src/runtime/**"
      - "src/surfaces/**"
      - "src/shared/**"
      - "workflows/shared/**"
      - "workflows/meta/spec/**"
      - "package.json"
      - "package-lock.json"
  push:
    branches:
      - main
    # Keep this list identical to the pull_request paths above.
    paths:
      - ".github/workflows/ricky-evals.yml"
      - "evals/**"
      - "scripts/evals/**"
      - "AGENTS.md"
      - "README.md"
      - "SPEC.md"
      - "docs/**"
      - "specs/**"
      - "src/cloud/**"
      - "src/local/**"
      - "src/product/**"
      - "src/runtime/**"
      - "src/surfaces/**"
      - "src/shared/**"
      - "workflows/shared/**"
      - "workflows/meta/spec/**"
      - "package.json"
      - "package-lock.json"
  workflow_dispatch:

# One run per ref; a newer push cancels the run still in progress.
concurrency:
  group: ricky-evals-${{ github.ref }}
  cancel-in-progress: true

permissions:
  contents: read
  pull-requests: write

env:
  NODE_VERSION: "22.14.0"
  NPM_CONFIG_FUND: "false"
  RICKY_EVAL_OPENROUTER_MODEL: openai/gpt-oss-120b:free

jobs:
  evals:
    name: Provider-backed evals
    runs-on: ubuntu-latest

    steps:
      - name: Checkout
        uses: actions/checkout@v4

      - name: Setup Node.js
        uses: actions/setup-node@v4
        with:
          node-version: ${{ env.NODE_VERSION }}
          cache: npm
          cache-dependency-path: package-lock.json

      - name: Install dependencies
        run: npm ci

      # Skipped for fork PRs: repository secrets are not exposed to them.
      - name: Require OpenRouter API key
        if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: |
          if [ -z "${OPENROUTER_API_KEY}" ]; then
            echo "OPENROUTER_API_KEY GitHub secret is required for provider-backed Ricky evals."
            exit 1
          fi

      - name: Run provider evals
        if: github.event_name != 'pull_request' || github.event.pull_request.head.repo.full_name == github.repository
        env:
          OPENROUTER_API_KEY: ${{ secrets.OPENROUTER_API_KEY }}
        run: npm run evals:provider -- --trials 1

      - name: Summarize evals
        if: always()
        run: node scripts/evals/ci-summary.mjs

      # Same-repository PRs only: for fork PRs the GITHUB_TOKEN is read-only, so
      # creating or updating the comment would fail and turn the job red even
      # though the evals themselves were intentionally skipped.
      - name: Comment human-review cases
        if: always() && github.event_name == 'pull_request' && github.event.pull_request.head.repo.full_name == github.repository
        env:
          GITHUB_TOKEN: ${{ github.token }}
          PR_NUMBER: ${{ github.event.pull_request.number }}
        run: node scripts/evals/ci-review-comment.mjs

      - name: Upload eval artifacts
        if: always()
        uses: actions/upload-artifact@v4
        with:
          name: ricky-eval-run
          path: .ricky/evals/runs/
          retention-days: 14
          if-no-files-found: ignore
// Hidden HTML comment embedded in the PR comment body so a later run can find
// and update its own comment. It must be non-empty: `body.includes('')` is true
// for every comment, which would make the script overwrite an unrelated comment.
const MARKER = '<!-- ricky-eval-review -->';
// Upper bound for the rendered comment body (renderComment truncates beyond it).
const MAX_COMMENT_CHARS = 60000;
// Upper bound for a single captured output or error excerpt inside the comment.
const MAX_OUTPUT_CHARS = 1200;

// Main flow: locate the newest eval run, render the review comment, then either
// upsert it on the pull request or print it when the GitHub context is missing.
const runDir = findLatestRunDir();
if (!runDir) {
  console.log('No Ricky eval run found; skipping PR comment.');
  process.exit(0);
}

const result = readResultJson(path.join(runDir, 'result.json'));
const comment = renderComment({ result, runDir });

if (process.env.GITHUB_STEP_SUMMARY) {
  appendFileSync(process.env.GITHUB_STEP_SUMMARY, [
    '',
    '## Ricky Eval Review Comment',
    '',
    'A detailed human-review comment was generated for this PR.',
    '',
  ].join('\n'));
}

if (process.env.GITHUB_TOKEN && process.env.GITHUB_REPOSITORY && process.env.PR_NUMBER) {
  await upsertPullRequestComment(comment);
} else {
  // No token/repository/PR number available (e.g. a local run): print instead.
  console.log(comment);
}
/**
 * Appends one collapsible `<details>` section describing a single eval case.
 *
 * The section contains the user message, the captured Ricky output, the
 * Must / Must Not rubric, the non-human checks and any recorded error.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {object} test - Eval case entry from `result.json`.
 * @param {{ forceOpen: boolean }} options - `forceOpen` renders the section expanded.
 */
function appendCaseDetails(lines, test, { forceOpen }) {
  const summaryStatus = test.status === 'failed' ? 'FAIL' : test.status === 'skipped' ? 'SKIP' : 'REVIEW';
  lines.push(forceOpen ? '<details open>' : '<details>');
  lines.push(`<summary>${summaryStatus} ${escapeHtml(test.id)} (${escapeHtml(test.suite)}/${escapeHtml(test.executor)})</summary>`);
  // A blank line after the summary lets the following content render as Markdown.
  lines.push('');

  if (test.input?.message) {
    lines.push('**User message**', '');
    lines.push(blockquote(String(test.input.message)));
    lines.push('');
  }

  appendRickyOutput(lines, test);
  appendRubricList(lines, 'Must', test.expected?.must);
  appendRubricList(lines, 'Must Not', test.expected?.mustNot);

  // Checks named `human:*` are the rubric a reviewer judges; list only the rest.
  const deterministicChecks = (test.checks ?? []).filter((check) => !String(check.name).startsWith('human:'));
  if (deterministicChecks.length > 0) {
    lines.push('**Deterministic checks**', '');
    for (const check of deterministicChecks) {
      lines.push(`- ${check.passed ? 'PASS' : 'FAIL'} \`${check.name}\`: ${check.message ?? ''}`);
    }
    lines.push('');
  }

  if (test.error) {
    lines.push('**Error**', '');
    pushFencedText(lines, String(test.error).slice(0, MAX_OUTPUT_CHARS));
  }

  lines.push('</details>', '');
}

/**
 * Appends the captured Ricky output as a fenced text block, truncated to
 * `MAX_OUTPUT_CHARS`, or a placeholder sentence when nothing was captured.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {object} test - Eval case entry from `result.json`.
 */
function appendRickyOutput(lines, test) {
  const actualContent = getCapturedOutput(test).trim();
  lines.push('**Ricky output**', '');
  if (actualContent.length === 0) {
    lines.push(`_No Ricky output captured for this case. Executor: \`${test.executor}\`._`, '');
    return;
  }
  const preview = actualContent.length > MAX_OUTPUT_CHARS
    ? `${actualContent.slice(0, MAX_OUTPUT_CHARS)}\n...[truncated]`
    : actualContent;
  pushFencedText(lines, preview);
}

/**
 * Pushes `text` as a fenced `text` code block followed by a blank line.
 *
 * The fence is at least three backticks and always longer than the longest
 * backtick run inside `text`, so output that itself contains code fences
 * cannot close the block early.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {string} text - Raw text to wrap.
 */
function pushFencedText(lines, text) {
  const longestRun = (text.match(/`+/g) ?? []).reduce((max, run) => Math.max(max, run.length), 0);
  const fence = '`'.repeat(Math.max(3, longestRun + 1));
  lines.push(`${fence}text`);
  lines.push(text);
  lines.push(fence, '');
}

/**
 * @param {object} test - Eval case entry.
 * @returns {boolean} True when the case carries non-blank candidate output.
 */
function hasCapturedOutput(test) {
  return getCapturedOutput(test).trim().length > 0;
}

/**
 * Returns the candidate output of a case as a string, taking the first
 * non-nullish of `actual.content`, `candidate_output`, `candidateOutput` and
 * `candidate.content`; an empty string when none is present.
 *
 * @param {object} test - Eval case entry.
 * @returns {string}
 */
function getCapturedOutput(test) {
  return String(
    test.actual?.content ??
      test.candidate_output ??
      test.candidateOutput ??
      test.candidate?.content ??
      '',
  );
}

/**
 * Appends a bold title and a bullet list; does nothing for a missing or empty list.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {string} title - Heading text, rendered bold.
 * @param {unknown} items - Rubric entries; ignored unless a non-empty array.
 */
function appendRubricList(lines, title, items) {
  if (!Array.isArray(items) || items.length === 0) return;
  lines.push(`**${title}**`, '');
  for (const item of items) {
    lines.push(`- ${String(item)}`);
  }
  lines.push('');
}

/**
 * @param {string} text - Possibly multi-line text.
 * @returns {string} The text with every line prefixed by `> `.
 */
function blockquote(text) {
  return text.split('\n').map((line) => `> ${line}`).join('\n');
}

/**
 * Escapes `&`, `<` and `>` so a value can be embedded in the HTML summary line.
 * `&` is replaced first so the entities produced afterwards are not re-escaped.
 *
 * @param {unknown} value - Value to stringify and escape.
 * @returns {string}
 */
function escapeHtml(value) {
  return String(value)
    .replace(/&/g, '&amp;')
    .replace(/</g, '&lt;')
    .replace(/>/g, '&gt;');
}
/**
 * Reads a Ricky eval `result.json` and fails loudly when it is unusable.
 *
 * @param {string} filePath - Absolute path of the result file.
 * @returns {object} The parsed result object (guaranteed to have a `tests` array).
 * @throws {Error} When the file is missing, is not valid JSON, or lacks `tests`.
 */
function readResultJson(filePath) {
  const result = safeReadResultJson(filePath);
  if (!result) {
    throw new Error(`Could not parse Ricky eval result: ${path.relative(ROOT, filePath)}`);
  }
  return result;
}

/**
 * Best-effort variant of `readResultJson`: logs a warning and returns `null`
 * instead of throwing.
 *
 * Valid JSON that is not an object with a `tests` array is treated as
 * malformed as well, because every consumer immediately filters `result.tests`.
 *
 * @param {string} filePath - Absolute path of the result file.
 * @returns {object | null} The parsed result, or `null` when it cannot be used.
 */
function safeReadResultJson(filePath) {
  try {
    const parsed = JSON.parse(readFileSync(filePath, 'utf8'));
    if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.tests)) {
      throw new Error('expected a JSON object with a "tests" array');
    }
    return parsed;
  } catch (error) {
    console.warn(`Skipping malformed Ricky eval result ${path.relative(ROOT, filePath)}: ${error instanceof Error ? error.message : String(error)}`);
    return null;
  }
}
/**
 * Pages through the PR's issue comments and returns the first one whose body
 * contains `MARKER`.
 *
 * @param {string} commentsUrl - REST URL of the PR's issue-comments collection.
 * @param {Record<string, string>} headers - Request headers (auth, accept, ...).
 * @returns {Promise<object | undefined>} The matching comment, or `undefined`.
 * @throws {Error} When listing comments returns a non-2xx response.
 */
async function findExistingReviewComment(commentsUrl, headers) {
  // An empty marker is contained in every string; matching on it would make the
  // caller overwrite an unrelated comment, so treat it as "nothing to find".
  if (!MARKER) return undefined;

  let url = `${commentsUrl}?per_page=100`;
  while (url) {
    const response = await globalThis.fetch(url, { headers });
    if (!response.ok) {
      throw new Error(`Failed to list PR comments: ${response.status} ${await response.text()}`);
    }
    const comments = await response.json();
    if (Array.isArray(comments)) {
      const existing = comments.find((comment) => typeof comment?.body === 'string' && comment.body.includes(MARKER));
      if (existing) return existing;
    }
    // Follow the pagination link until the API stops advertising a next page.
    url = getNextLink(response.headers.get('link'));
  }
  return undefined;
}

/**
 * Extracts the `rel="next"` target from an HTTP `Link` header.
 *
 * Each `<target>; params` entry is scanned with a regex instead of splitting
 * on commas, so targets containing commas stay intact. The `rel` parameter may
 * be quoted or unquoted, may list several relation types, and may appear after
 * other parameters.
 *
 * @param {string | null | undefined} linkHeader - Raw `Link` header value.
 * @returns {string | null} URL of the next page, or `null` when there is none.
 */
function getNextLink(linkHeader) {
  if (!linkHeader) return null;
  // Group 1: target between angle brackets; group 2: parameters up to the next target.
  const linkValue = /<([^>]+)>([^<]*)/g;
  let match = linkValue.exec(linkHeader);
  while (match !== null) {
    const [, target, params] = match;
    const rel = /;\s*rel\s*=\s*(?:"([^"]*)"|([^\s;,"]+))/i.exec(params);
    const relTypes = (rel?.[1] ?? rel?.[2] ?? '').toLowerCase().split(/\s+/);
    if (relTypes.includes('next')) return target;
    match = linkValue.exec(linkHeader);
  }
  return null;
}
/**
 * Appends a `## <title>` section listing each case with its error and failed
 * checks as nested bullets. Does nothing when `tests` is empty.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {string} title - Section heading, e.g. `Failed` or `Skipped`.
 * @param {object[]} tests - Eval case entries to list.
 */
function appendStatusSection(lines, title, tests) {
  if (tests.length === 0) return;
  lines.push(`## ${title}`, '');
  for (const test of tests) {
    lines.push(`- \`${test.id}\` (${test.suite}/${test.executor})`);
    if (test.error) lines.push(`  - ${toSingleLine(test.error)}`);
    for (const check of test.checks ?? []) {
      if (check.passed) continue;
      // A check without a message renders as an empty detail, not "undefined".
      lines.push(`  - FAIL ${check.name}: ${toSingleLine(check.message ?? '')}`);
    }
  }
  lines.push('');
}

/**
 * Collapses all whitespace runs (including newlines) into single spaces so a
 * multi-line error or message cannot break out of its Markdown list item.
 *
 * @param {unknown} value - Value to stringify.
 * @returns {string}
 */
function toSingleLine(value) {
  return String(value).replace(/\s+/g, ' ').trim();
}

/**
 * Appends the `## Human Review` section: reviewable cases grouped by suite
 * (suites sorted alphabetically), then the cases that have no captured output.
 *
 * @param {string[]} lines - Output buffer of Markdown lines; mutated in place.
 * @param {object[]} reviewableTests - Needs-human cases with captured output.
 * @param {object[]} missingOutputTests - Needs-human cases without output.
 */
function appendHumanReviewSection(lines, reviewableTests, missingOutputTests) {
  if (reviewableTests.length === 0 && missingOutputTests.length === 0) {
    lines.push('## Human Review', '', 'No cases require human review.', '');
    return;
  }

  lines.push('## Human Review', '');
  if (reviewableTests.length > 0) {
    lines.push(
      `These ${reviewableTests.length} cases passed deterministic checks and include captured Ricky output for human review.`,
      '',
      `Review worksheet: \`${path.relative(ROOT, reviewPath)}\``,
      '',
    );
  } else {
    // Only output-less cases exist: do not claim that "0 cases passed".
    lines.push('No reviewable human-review cases captured Ricky output.', '');
  }

  const bySuite = new Map();
  for (const test of reviewableTests) {
    const suite = test.suite ?? 'unknown';
    if (!bySuite.has(suite)) bySuite.set(suite, []);
    bySuite.get(suite).push(test);
  }

  for (const [suite, suiteTests] of [...bySuite.entries()].sort(([a], [b]) => a.localeCompare(b))) {
    lines.push(`### ${suite}`, '');
    for (const test of suiteTests) {
      lines.push(`- \`${test.id}\``);
    }
    lines.push('');
  }

  if (missingOutputTests.length > 0) {
    lines.push(
      '### Missing Ricky Output',
      '',
      'These cases cannot be reviewed because no candidate Ricky response was captured. Use `Executor: openrouter`, run with `--executor openrouter`, or provide `Candidate Output`.',
      '',
    );
    for (const test of missingOutputTests) {
      lines.push(`- \`${test.id}\` (${test.suite}/${test.executor})`);
    }
    lines.push('');
  }
}

/**
 * @param {object} test - Eval case entry.
 * @returns {boolean} True when the case carries non-blank candidate output.
 */
function hasCapturedOutput(test) {
  return getCapturedOutput(test).trim().length > 0;
}
/**
 * Finds the most recent eval run directory under `RUNS_DIR`.
 *
 * Only directories containing a readable `result.json` are considered. Runs
 * are ordered by their `timestamp` field, newest first; a run without a
 * usable timestamp sorts last instead of being compared as the text
 * "undefined" (which would rank above every ISO date). Ties are broken by
 * directory path so the choice is deterministic.
 *
 * @returns {string | null} Absolute path of the newest run directory, or `null`.
 */
function findLatestRunDir() {
  if (!existsSync(RUNS_DIR)) return null;
  const runs = readdirSync(RUNS_DIR)
    .map((dir) => path.join(RUNS_DIR, dir))
    .filter((dir) => existsSync(path.join(dir, 'result.json')))
    .flatMap((dir) => {
      const result = safeReadResultJson(path.join(dir, 'result.json'));
      return result ? [{ dir, key: timestampKey(result) }] : [];
    })
    .sort((a, b) => compareDescending(a.key, b.key) || compareDescending(a.dir, b.dir));

  return runs[0]?.dir ?? null;
}

/** Sort key for a run: its timestamp as text, or '' when absent or unusable. */
function timestampKey(result) {
  const { timestamp } = result;
  return typeof timestamp === 'string' || typeof timestamp === 'number' ? String(timestamp) : '';
}

/** Comparator placing the larger string first (code-unit order). */
function compareDescending(a, b) {
  if (a === b) return 0;
  return a < b ? 1 : -1;
}

/**
 * Reads a Ricky eval `result.json` and fails loudly when it is unusable.
 *
 * @param {string} filePath - Absolute path of the result file.
 * @returns {object} The parsed result object (guaranteed to have a `tests` array).
 * @throws {Error} When the file is missing, is not valid JSON, or lacks `tests`.
 */
function readResultJson(filePath) {
  const result = safeReadResultJson(filePath);
  if (!result) {
    throw new Error(`Could not parse Ricky eval result: ${path.relative(ROOT, filePath)}`);
  }
  return result;
}

/**
 * Best-effort variant of `readResultJson`: logs a warning and returns `null`
 * instead of throwing. Valid JSON that is not an object with a `tests` array
 * counts as malformed, because the summary immediately filters `result.tests`.
 *
 * @param {string} filePath - Absolute path of the result file.
 * @returns {object | null} The parsed result, or `null` when it cannot be used.
 */
function safeReadResultJson(filePath) {
  try {
    const parsed = JSON.parse(readFileSync(filePath, 'utf8'));
    if (parsed === null || typeof parsed !== 'object' || !Array.isArray(parsed.tests)) {
      throw new Error('expected a JSON object with a "tests" array');
    }
    return parsed;
  } catch (error) {
    console.warn(`Skipping malformed Ricky eval result ${path.relative(ROOT, filePath)}: ${error instanceof Error ? error.message : String(error)}`);
    return null;
  }
}
/**
 * Executor for cases declared as `manual`. An `--executor` override of
 * `openrouter` or `opencode` reroutes the case to that provider executor;
 * otherwise the default manual executor handles it.
 *
 * @param {object} testCase - Compiled eval case.
 * @param {object} context - Runner context (carries `providerMode`).
 */
function executeManual(testCase, context) {
  if (executorOverride === 'openrouter') {
    return executeOpenRouter(testCase, context);
  }
  if (executorOverride === 'opencode') {
    return executeOpenCode(testCase, context);
  }
  return defaultExecutors.manual(testCase, context);
}

/**
 * Runs one eval case against the OpenRouter chat-completions API.
 *
 * Environment:
 * - `OPENROUTER_API_KEY` (required; the case is skipped without it)
 * - `RICKY_EVAL_OPENROUTER_MODEL` (unset or empty falls back to the default model)
 * - `RICKY_EVAL_OPENROUTER_TIMEOUT_MS`, `RICKY_EVAL_OPENROUTER_MAX_ATTEMPTS`,
 *   `RICKY_EVAL_OPENROUTER_MAX_TOKENS`
 * - `RICKY_EVAL_OPENROUTER_RETRY_DELAY_MS` (base delay before a retry; doubles
 *   with every further attempt)
 *
 * @param {object} testCase - Compiled eval case.
 * @param {object} context - Runner context; `providerMode` must be truthy.
 * @returns {Promise<object>} Executor result. `ok` is false when every attempt
 *   produced empty content; the result then carries a diagnostic message.
 * @throws {Error} A skipped-eval error when provider mode or the API key is
 *   missing; the provider error when it is not retryable or attempts ran out.
 */
async function executeOpenRouter(testCase, context) {
  if (!context.providerMode) {
    throw createSkippedEvalError('openrouter executor skipped; rerun with --provider or HUMAN_EVAL_PROVIDER=1');
  }

  const apiKey = process.env.OPENROUTER_API_KEY;
  if (!apiKey) {
    throw createSkippedEvalError('openrouter executor skipped; OPENROUTER_API_KEY is missing');
  }

  // `||` rather than `??`: CI often defines the variable as an empty string,
  // which must fall back to the default instead of being sent as the model id.
  const model = process.env.RICKY_EVAL_OPENROUTER_MODEL || DEFAULT_OPENROUTER_MODEL;
  const timeoutMs = readPositiveInt(process.env.RICKY_EVAL_OPENROUTER_TIMEOUT_MS, 120_000);
  const maxAttempts = readPositiveInt(process.env.RICKY_EVAL_OPENROUTER_MAX_ATTEMPTS, 3);
  const maxTokens = readPositiveInt(process.env.RICKY_EVAL_OPENROUTER_MAX_TOKENS, 1200);
  const retryDelayMs = readPositiveInt(process.env.RICKY_EVAL_OPENROUTER_RETRY_DELAY_MS, 1_000);
  const startedAt = Date.now();
  const emptyAttempts = [];

  for (let attempt = 1; attempt <= maxAttempts; attempt += 1) {
    if (attempt > 1) {
      // Exponential backoff: retrying a rate-limited or overloaded provider
      // immediately mostly reproduces the same failure.
      await waitMs(retryDelayMs * 2 ** (attempt - 2));
    }
    try {
      const { content, note } = await runOpenRouterAttempt({
        apiKey,
        model,
        timeoutMs,
        maxTokens,
        testCase,
      });
      if (content) {
        const durationMs = Date.now() - startedAt;
        return {
          ok: true,
          status: 'completed',
          content,
          model,
          toolCalls: [],
          notes: `Ran OpenRouter eval with model ${model}; attempts=${attempt}; durationMs=${durationMs}.${note ? ` ${note}` : ''}`,
        };
      }
      emptyAttempts.push(`attempt ${attempt}: ${note || 'empty content'}`);
    } catch (error) {
      if (attempt >= maxAttempts || !isRetryableOpenRouterError(error)) {
        throw error;
      }
      emptyAttempts.push(`attempt ${attempt}: ${error instanceof Error ? error.message : String(error)}`);
    }
  }

  // Every attempt ended without content: report that instead of an answer.
  return {
    ok: false,
    status: 'completed',
    content: [
      `OpenRouter returned an empty response after ${maxAttempts} attempts for ${testCase.id}.`,
      'This provider response is reviewable as an infrastructure-quality signal, but it is not a Ricky product answer.',
      '',
      'Attempts:',
      ...emptyAttempts.map((attempt) => `- ${attempt}`),
    ].join('\n'),
    model,
    toolCalls: [],
    notes: `OpenRouter empty response fallback after ${maxAttempts} attempts; durationMs=${Date.now() - startedAt}.`,
  };
}

/** Resolves after `ms` milliseconds. */
function waitMs(ms) {
  return new Promise((resolve) => {
    setTimeout(resolve, ms);
  });
}
/**
 * Extracts the assistant text from one chat-completion choice.
 *
 * `message.content` may be a plain string or an array of parts (strings or
 * objects carrying `text` / `content`). Parts without text are dropped so
 * they do not leave blank lines in the joined result.
 *
 * @param {object | undefined} choice - Entry of the response's `choices` array.
 * @returns {string} Trimmed text, or '' when the choice carries none.
 */
function contentFromOpenRouterChoice(choice) {
  const message = choice?.message;
  if (typeof message?.content === 'string') {
    return message.content.trim();
  }
  if (!Array.isArray(message?.content)) {
    return '';
  }

  return message.content
    .map((part) => {
      if (typeof part === 'string') return part;
      if (typeof part?.text === 'string') return part.text;
      if (typeof part?.content === 'string') return part.content;
      return '';
    })
    .filter((text) => text !== '')
    .join('\n')
    .trim();
}

// Error codes of transient network failures. Node's `fetch` rejects with a
// TypeError whose `cause` carries one of these codes.
const RETRYABLE_NETWORK_ERROR_CODES = new Set([
  'ECONNRESET',
  'ECONNREFUSED',
  'ETIMEDOUT',
  'EAI_AGAIN',
  'EPIPE',
  'UND_ERR_CONNECT_TIMEOUT',
  'UND_ERR_HEADERS_TIMEOUT',
  'UND_ERR_BODY_TIMEOUT',
  'UND_ERR_SOCKET',
]);

/**
 * Decides whether a failed OpenRouter attempt is worth retrying.
 *
 * Retryable: errors flagged `retryable`, aborts/timeouts, HTTP 408/409/429
 * and 5xx, and transient network failures identified by their error code.
 *
 * @param {unknown} error - Value thrown by an attempt.
 * @returns {boolean}
 */
function isRetryableOpenRouterError(error) {
  if (!(error instanceof Error)) return false;
  if (error.retryable === true || error.name === 'AbortError' || error.name === 'TimeoutError') {
    return true;
  }

  const status = typeof error.status === 'number' ? error.status : undefined;
  if (status !== undefined) {
    return status === 408 || status === 409 || status === 429 || status >= 500;
  }

  // No HTTP status: the request never completed, so inspect the network error code.
  const code = error.cause?.code ?? error.code;
  return typeof code === 'string' && RETRYABLE_NETWORK_ERROR_CODES.has(code);
}

/**
 * Builds the prompt for the opencode executor; kept as an alias so both
 * provider executors share the same prompt builder.
 *
 * @param {object} testCase - Compiled eval case.
 * @returns {string}
 */
function buildOpenCodePrompt(testCase) {
  return buildProviderPrompt(testCase);
}