Review notes (preamble text, not part of any hunk):
- Line breaks were rebuilt from the "+" markers of the collapsed patch; indentation
  and text that had been stripped from inside angle brackets (generic arguments,
  commander placeholders) were reconstructed.
- ASSUMPTIONS to confirm: commander placeholder names (<agent>, <ids...>, ...),
  the listPublicBenchmarks return type name (ListBenchmarksResult), the leading
  whitespace inside the suggestion / scenario indent strings, README column
  spacing and the README hunk's last context line (not recoverable).
- "index" blob ids and the "+1,N" counts of the two new files: the counts were
  updated for the changes below, the blob ids are the original ones.
- run.ts: validate flags before creating secrets; reject NaN / non-positive
  numeric flags; count --secrets / --env-vars towards the requiresAny check;
  own-property agent lookup; list secrets once; diagnostics go to stderr.
- status.ts: stop polling on timeout; case-insensitive terminal-state check;
  null-safe score formatting; watch progress goes to stderr.

diff --git a/README.md b/README.md
index 9f833a11..86d1bb95 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,13 @@ rli mcp start # Start the MCP server
 rli mcp install # Install Runloop MCP server configurat...
 ```
 
+### Benchmark-job Commands (alias: `bmj`)
+
+```bash
+rli benchmark-job run     # Run a benchmark job with an agent
+rli benchmark-job status  # Get benchmark job status and results
+```
+
 ## MCP Server (AI Integration)
 
diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
new file mode 100644
index 00000000..e0960841
--- /dev/null
+++ b/src/commands/benchmark-job/run.ts
@@ -0,0 +1,359 @@
+/**
+ * Run benchmark job command
+ */
+
+import chalk from "chalk";
+import { createBenchmarkJob } from "../../services/benchmarkJobService.js";
+import {
+  listBenchmarks,
+  listPublicBenchmarks,
+} from "../../services/benchmarkService.js";
+import { getClient } from "../../utils/client.js";
+import { output, outputError } from "../../utils/output.js";
+
+// Secret name prefix for benchmark job secrets
+const SECRET_PREFIX = "BMJ_";
+
+// Supported agents and their automatic environment variables (mapped to BMJ_* secrets)
+// - automaticEnvVars: env vars that will be auto-populated from secrets or environment
+// - requiresAny: if true, at least one must be set; if false, just try to auto-populate
+const SUPPORTED_AGENTS = {
+  "claude-code": {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
+    requiresAny: true, // At least one of these is required
+  },
+  codex: {
+    automaticEnvVars: ["OPENAI_API_KEY"],
+    requiresAny: true,
+  },
+  opencode: {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
+  },
+  goose: {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
+  },
+  "gemini-cli": {
+    automaticEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: true, // At least one of these is required
+  },
+} as const;
+
+type SupportedAgent = keyof typeof SUPPORTED_AGENTS;
+
+interface RunOptions {
+  agent: string;
+  model: string;
+  benchmark?: string;
+  scenarios?: string[];
+  jobName?: string;
+  envVars?: string[];
+  secrets?: string[];
+  timeout?: string;
+  nAttempts?: string;
+  nConcurrentTrials?: string;
+  timeoutMultiplier?: string;
+  output?: string;
+}
+
+// Defaults applied when the corresponding flag is omitted
+const DEFAULT_TIMEOUT_SECONDS = 1800;
+const DEFAULT_N_CONCURRENT_TRIALS = 10;
+const DEFAULT_N_ATTEMPTS = 1;
+const DEFAULT_TIMEOUT_MULTIPLIER = 1.0;
+
+// Split "LEFT=right" on the first "="; throws when "=" is missing or LEFT is empty
+function splitAssignment(
+  entry: string,
+  errorMessage: string,
+): [string, string] {
+  const eqIndex = entry.indexOf("=");
+  if (eqIndex <= 0) {
+    throw new Error(errorMessage);
+  }
+  return [entry.substring(0, eqIndex), entry.substring(eqIndex + 1)];
+}
+
+// Parse environment variables from KEY=value format (value may be empty)
+function parseEnvVars(envVars: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const envVar of envVars) {
+    const [key, value] = splitAssignment(
+      envVar,
+      `Invalid environment variable format: ${envVar}. Expected KEY=value`,
+    );
+    result[key] = value;
+  }
+  return result;
+}
+
+// Parse secrets from ENV_VAR=SECRET_NAME format (both sides must be non-empty)
+function parseSecrets(secrets: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const secret of secrets) {
+    const message = `Invalid secret format: ${secret}. Expected ENV_VAR=SECRET_NAME`;
+    const [envVarName, secretName] = splitAssignment(secret, message);
+    if (!secretName) {
+      throw new Error(message);
+    }
+    result[envVarName] = secretName;
+  }
+  return result;
+}
+
+// Parse a numeric CLI option, using `fallback` when the flag is omitted.
+// Throws on non-numeric, non-positive or (when `integer` is set) fractional input
+// so that NaN is never sent to the API.
+function parseNumberOption(
+  flag: string,
+  raw: string | undefined,
+  fallback: number,
+  integer: boolean,
+): number {
+  if (!raw) {
+    return fallback;
+  }
+  const value = Number(raw);
+  const valid = integer ? Number.isInteger(value) : Number.isFinite(value);
+  if (!valid || value <= 0) {
+    const expected = integer ? "integer" : "number";
+    throw new Error(
+      `Invalid value for ${flag}: "${raw}". Expected a positive ${expected}`,
+    );
+  }
+  return value;
+}
+
+// Validate agent is supported
+function validateAgent(agent: string): asserts agent is SupportedAgent {
+  // Own-property check: `in` would also accept inherited keys such as "toString"
+  if (!Object.prototype.hasOwnProperty.call(SUPPORTED_AGENTS, agent)) {
+    const supportedList = Object.keys(SUPPORTED_AGENTS).join(", ");
+    throw new Error(
+      `Unsupported agent: ${agent}. Supported agents: ${supportedList}`,
+    );
+  }
+}
+
+// Fetch the names of all existing secrets with a single list request
+async function listSecretNames(): Promise<Set<string>> {
+  const client = getClient();
+  // TODO: Fetch by name when API exposed.
+  const result = await client.secrets.list({ limit: 5000 });
+  const names = new Set<string>();
+  for (const secret of result.secrets ?? []) {
+    if (secret.name) {
+      names.add(secret.name);
+    }
+  }
+  return names;
+}
+
+// Create a secret
+async function createSecret(name: string, value: string): Promise<void> {
+  const client = getClient();
+  await client.secrets.create({ name, value });
+}
+
+// Ensure agent secrets exist, creating them from env vars if needed
+// Returns the secrets mapping (ENV_VAR -> BMJ_ENV_VAR)
+// Progress is written to stderr so that stdout carries only the command result
+async function ensureAgentSecrets(
+  agent: SupportedAgent,
+): Promise<Record<string, string>> {
+  const agentConfig = SUPPORTED_AGENTS[agent];
+  const secrets: Record<string, string> = {};
+
+  // List once up front instead of re-listing for every env var
+  const existingSecrets = await listSecretNames();
+
+  for (const varName of agentConfig.automaticEnvVars) {
+    const secretName = `${SECRET_PREFIX}${varName}`;
+    const envValue = process.env[varName];
+
+    if (existingSecrets.has(secretName)) {
+      console.error(chalk.dim(`Secret ${secretName} exists`));
+      secrets[varName] = secretName;
+    } else if (envValue) {
+      // Create secret from env var
+      console.error(
+        chalk.cyan(`Creating secret ${secretName} from ${varName} env var`),
+      );
+      await createSecret(secretName, envValue);
+      secrets[varName] = secretName;
+    } else {
+      // No secret and no env var - skip (will be validated later if required)
+      console.error(
+        chalk.yellow(
+          `Secret ${secretName} not found and ${varName} not set in environment`,
+        ),
+      );
+    }
+  }
+
+  return secrets;
+}
+
+// Resolve benchmark name to ID if needed
+async function resolveBenchmarkId(benchmarkIdOrName: string): Promise<string> {
+  // If it looks like an ID (starts with bm_ or similar), return as-is
+  if (
+    benchmarkIdOrName.startsWith("bm_") ||
+    benchmarkIdOrName.startsWith("bmk_")
+  ) {
+    return benchmarkIdOrName;
+  }
+
+  // Search both user benchmarks and public benchmarks
+  const [userResult, publicResult] = await Promise.all([
+    listBenchmarks({
+      limit: 100,
+      search: benchmarkIdOrName,
+    }),
+    listPublicBenchmarks({
+      limit: 100,
+      search: benchmarkIdOrName,
+    }),
+  ]);
+
+  // Combine results
+  const allBenchmarks = [...userResult.benchmarks, ...publicResult.benchmarks];
+
+  // Look for exact name match
+  const exactMatch = allBenchmarks.find((b) => b.name === benchmarkIdOrName);
+
+  if (exactMatch) {
+    return exactMatch.id;
+  }
+
+  if (allBenchmarks.length === 0) {
+    throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`);
+  }
+
+  // If no exact match but we have results, suggest them
+  const suggestions = allBenchmarks
+    .slice(0, 5)
+    .map((b) => `  - ${b.name} (${b.id})`)
+    .join("\n");
+  throw new Error(
+    `No exact match for benchmark "${benchmarkIdOrName}". Did you mean:\n${suggestions}`,
+  );
+}
+
+export async function runBenchmarkJob(options: RunOptions) {
+  try {
+    // Validate agent (narrows `agent` to SupportedAgent)
+    const agent: string = options.agent;
+    validateAgent(agent);
+
+    // Check local preconditions before any network call or secret creation
+    const hasScenarios = (options.scenarios?.length ?? 0) > 0;
+    if (!options.benchmark && !hasScenarios) {
+      throw new Error("Either --benchmark or --scenarios must be specified");
+    }
+    if (options.benchmark && hasScenarios) {
+      throw new Error("Cannot specify both --benchmark and --scenarios");
+    }
+
+    // Parse provided env vars and secrets
+    const providedEnvVars: Record<string, string> = options.envVars
+      ? parseEnvVars(options.envVars)
+      : {};
+    const providedSecrets: Record<string, string> = options.secrets
+      ? parseSecrets(options.secrets)
+      : {};
+
+    // Build orchestrator config with defaults (throws on malformed numbers)
+    const orchestratorConfig = {
+      nConcurrentTrials: parseNumberOption(
+        "--n-concurrent-trials",
+        options.nConcurrentTrials,
+        DEFAULT_N_CONCURRENT_TRIALS,
+        true,
+      ),
+      nAttempts: parseNumberOption(
+        "--n-attempts",
+        options.nAttempts,
+        DEFAULT_N_ATTEMPTS,
+        true,
+      ),
+      timeoutMultiplier: parseNumberOption(
+        "--timeout-multiplier",
+        options.timeoutMultiplier,
+        DEFAULT_TIMEOUT_MULTIPLIER,
+        false,
+      ),
+      quiet: false,
+    };
+    const timeoutSeconds = parseNumberOption(
+      "--timeout",
+      options.timeout,
+      DEFAULT_TIMEOUT_SECONDS,
+      true,
+    );
+
+    // Ensure agent secrets exist (auto-create from env vars if needed)
+    // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY)
+    const agentSecrets = await ensureAgentSecrets(agent);
+
+    // Combine agent secrets with user-provided secrets (user-provided wins)
+    const secrets = {
+      ...agentSecrets,
+      ...providedSecrets,
+    };
+
+    // Validate that at least one credential is available (only if requiresAny is true).
+    // Credentials passed explicitly via --secrets or --env-vars count as well.
+    const agentConfig = SUPPORTED_AGENTS[agent];
+    if (agentConfig.requiresAny) {
+      const hasAny = agentConfig.automaticEnvVars.some((varName) =>
+        Boolean(secrets[varName] || providedEnvVars[varName]),
+      );
+      if (!hasAny) {
+        throw new Error(
+          `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` +
+            `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
+            `or set environment variables.`,
+        );
+      }
+    }
+    // If requiresAny is false, we just use whatever secrets were auto-populated
+    // User may be configuring credentials via other means (e.g., --secrets flag)
+
+    // Resolve benchmark ID if name was provided
+    let benchmarkId: string | undefined;
+    if (options.benchmark) {
+      benchmarkId = await resolveBenchmarkId(options.benchmark);
+    }
+
+    // Create the benchmark job
+    const job = await createBenchmarkJob({
+      name: options.jobName,
+      benchmarkId,
+      scenarioIds: options.scenarios,
+      agentConfigs: [
+        {
+          name: agent,
+          modelName: options.model,
+          timeoutSeconds,
+          environmentVariables:
+            Object.keys(providedEnvVars).length > 0
+              ? providedEnvVars
+              : undefined,
+          secrets,
+        },
+      ],
+      orchestratorConfig,
+    });
+
+    // Output result
+    if (!options.output || options.output === "text") {
+      console.log(job.id);
+    } else {
+      output(job, { format: options.output, defaultFormat: "json" });
+    }
+  } catch (error) {
+    outputError("Failed to run benchmark job", error);
+  }
+}
diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts
new file mode 100644
index 00000000..c65bef8a
--- /dev/null
+++ b/src/commands/benchmark-job/status.ts
@@ -0,0 +1,305 @@
+/**
+ * Status benchmark job command
+ */
+
+import chalk from "chalk";
+import { getBenchmarkJob } from "../../services/benchmarkJobService.js";
+import { output, outputError } from "../../utils/output.js";
+
+interface StatusOptions {
+  watch?: boolean;
+  output?: string;
+}
+
+// Job states that indicate completion (matched case-insensitively)
+const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
+
+// Polling config
+const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds
+const MAX_WAIT_MS = 60 * 60 * 4 * 1000; // 4 hours
+
+// Sleep utility
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+// True when a job state is terminal; case is normalized before matching
+function isCompletedState(state: string | undefined): boolean {
+  return COMPLETED_STATES.includes((state ?? "").toLowerCase());
+}
+
+interface ScenarioOutcome {
+  scenario_name?: string;
+  scenario_definition_id?: string;
+  state?: string;
+  score?: number | null;
+}
+
+interface BenchmarkOutcome {
+  agent_name?: string;
+  model_name?: string;
+  scenario_outcomes?: ScenarioOutcome[];
+}
+
+interface JobData {
+  id: string;
+  name?: string;
+  state?: string;
+  benchmark_outcomes?: BenchmarkOutcome[];
+}
+
+// Calculate stats for scenario outcomes
+function calculateStats(outcomes: ScenarioOutcome[]): {
+  total: number;
+  passed: number;
+  failedZero: number;
+  failedError: number;
+} {
+  let passed = 0;
+  let failedZero = 0;
+  let failedError = 0;
+
+  for (const outcome of outcomes) {
+    const state = outcome.state?.toUpperCase();
+    const score = outcome.score;
+
+    if (state === "COMPLETED") {
+      if (score === 1.0) {
+        passed++;
+      } else {
+        failedZero++;
+      }
+    } else {
+      // Any non-COMPLETED state is an error
+      failedError++;
+    }
+  }
+
+  return {
+    total: outcomes.length,
+    passed,
+    failedZero,
+    failedError,
+  };
+}
+
+// Format percentage
+function formatPercent(count: number, total: number): string {
+  if (total === 0) return "0.0%";
+  return ((count / total) * 100).toFixed(1) + "%";
+}
+
+// Print current status (brief)
+function printStatus(job: JobData): void {
+  const jobName = job.name || job.id;
+  const state = job.state || "unknown";
+
+  console.log(`Job: ${jobName}`);
+  console.log(`ID: ${job.id}`);
+  console.log(`State: ${state}`);
+
+  if (isCompletedState(state)) {
+    const outcomes = job.benchmark_outcomes || [];
+    if (outcomes.length > 0) {
+      let totalScenarios = 0;
+      let totalPassed = 0;
+      for (const outcome of outcomes) {
+        const stats = calculateStats(outcome.scenario_outcomes || []);
+        totalScenarios += stats.total;
+        totalPassed += stats.passed;
+      }
+      if (totalScenarios > 0) {
+        console.log(
+          `Results: ${totalPassed}/${totalScenarios} passed (${formatPercent(totalPassed, totalScenarios)})`,
+        );
+      }
+    }
+  }
+}
+
+// Print results table
+function printResultsTable(job: JobData): void {
+  const outcomes = job.benchmark_outcomes || [];
+
+  if (outcomes.length === 0) {
+    console.log(chalk.yellow("No benchmark outcomes found"));
+    return;
+  }
+
+  // Header
+  console.log();
+  console.log(chalk.bold("Benchmark Job Results"));
+  console.log(chalk.dim(`Job ID: ${job.id}`));
+  if (job.name) {
+    console.log(chalk.dim(`Name: ${job.name}`));
+  }
+  console.log(chalk.dim(`State: ${job.state}`));
+  console.log();
+
+  // Table header
+  const agentCol = "Agent / Model".padEnd(40);
+  const passedCol = "Passed".padStart(10);
+  const failedCol = "Failed (0.0)".padStart(14);
+  const errorCol = "Failed (error)".padStart(16);
+  const totalCol = "Total".padStart(8);
+
+  console.log(
+    chalk.bold(agentCol + passedCol + failedCol + errorCol + totalCol),
+  );
+  console.log(chalk.dim("-".repeat(88)));
+
+  // Print each agent's results
+  for (const outcome of outcomes) {
+    const agentName = outcome.agent_name || "unknown";
+    const modelName = outcome.model_name || "default";
+    const scenarioOutcomes = outcome.scenario_outcomes || [];
+
+    const stats = calculateStats(scenarioOutcomes);
+
+    // Format agent/model column
+    let agentModelStr = agentName;
+    if (modelName && modelName !== "default") {
+      agentModelStr += ` (${modelName})`;
+    }
+    if (agentModelStr.length > 38) {
+      agentModelStr = agentModelStr.slice(0, 35) + "...";
+    }
+    const agentModelCol = agentModelStr.padEnd(40);
+
+    // Format stats columns with colors
+    const passedStr = formatPercent(stats.passed, stats.total);
+    const failedZeroStr = formatPercent(stats.failedZero, stats.total);
+    const failedErrorStr = formatPercent(stats.failedError, stats.total);
+
+    const passedColored =
+      stats.passed > 0
+        ? chalk.green(passedStr.padStart(10))
+        : chalk.dim(passedStr.padStart(10));
+
+    const failedZeroColored =
+      stats.failedZero > 0
+        ? chalk.yellow(failedZeroStr.padStart(14))
+        : chalk.dim(failedZeroStr.padStart(14));
+
+    const failedErrorColored =
+      stats.failedError > 0
+        ? chalk.red(failedErrorStr.padStart(16))
+        : chalk.dim(failedErrorStr.padStart(16));
+
+    const totalColStr = String(stats.total).padStart(8);
+
+    console.log(
+      agentModelCol +
+        passedColored +
+        failedZeroColored +
+        failedErrorColored +
+        chalk.dim(totalColStr),
+    );
+
+    // Print individual scenario results underneath (indented)
+    for (const scenario of scenarioOutcomes) {
+      const scenarioName =
+        scenario.scenario_name || scenario.scenario_definition_id || "unknown";
+      const state = scenario.state || "unknown";
+      const score = scenario.score;
+
+      let statusIcon: string;
+      let statusColor: typeof chalk.green;
+
+      if (state.toUpperCase() === "COMPLETED") {
+        if (score === 1.0) {
+          statusIcon = chalk.green("\u2713"); // checkmark
+          statusColor = chalk.green;
+        } else {
+          statusIcon = chalk.yellow("\u2717"); // X
+          statusColor = chalk.yellow;
+        }
+      } else {
+        statusIcon = chalk.red("!");
+        statusColor = chalk.red;
+      }
+
+      const scenarioNameTrunc =
+        scenarioName.length > 50
+          ? scenarioName.slice(0, 47) + "..."
+          : scenarioName;
+
+      // typeof guard: a null score from the API must not reach toFixed
+      const scoreStr =
+        typeof score === "number" ? `score=${score.toFixed(1)}` : state;
+
+      console.log(
+        chalk.dim("  ") +
+          statusIcon +
+          " " +
+          chalk.dim(scenarioNameTrunc.padEnd(52)) +
+          statusColor(scoreStr),
+      );
+    }
+  }
+
+  console.log();
+}
+
+export async function statusBenchmarkJob(
+  id: string,
+  options: StatusOptions = {},
+) {
+  try {
+    // Initial fetch
+    let job = (await getBenchmarkJob(id)) as unknown as JobData;
+
+    // Check if job is complete
+    const isComplete = isCompletedState(job.state);
+
+    // If not waiting or already complete, just print status/results
+    if (!options.watch || isComplete) {
+      if (options.output && options.output !== "text") {
+        output(job, { format: options.output, defaultFormat: "json" });
+      } else if (isComplete) {
+        printResultsTable(job);
+      } else {
+        printStatus(job);
+      }
+      return;
+    }
+
+    // Wait mode: poll until complete.
+    // Progress goes to stderr so stdout stays parseable with -o json|yaml.
+    const jobName = job.name || job.id;
+    console.error(chalk.cyan(`Awaiting job "${jobName}" completion...`));
+    console.error(chalk.dim(`Current state: ${job.state}`));
+    console.error();
+
+    const startTime = Date.now();
+    let timedOut = false;
+
+    while (!isCompletedState(job.state)) {
+      // Stop polling once the maximum wait has elapsed
+      if (Date.now() - startTime > MAX_WAIT_MS) {
+        timedOut = true;
+        break;
+      }
+
+      await sleep(POLL_INTERVAL_MS);
+      job = (await getBenchmarkJob(id)) as unknown as JobData;
+      process.stderr.write(chalk.dim("."));
+    }
+
+    console.error();
+    console.error();
+
+    if (timedOut) {
+      outputError(
+        `Timeout waiting for job completion after ${MAX_WAIT_MS / 1000 / 60} minutes`,
+      );
+    } else if (options.output && options.output !== "text") {
+      // Output based on format
+      output(job, { format: options.output, defaultFormat: "json" });
+    } else {
+      printResultsTable(job);
+    }
+  } catch (error) {
+    outputError("Failed to get benchmark job status", error);
+  }
+}
diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts
index 17a164cd..e6373464 100644
--- a/src/services/benchmarkService.ts
+++ b/src/services/benchmarkService.ts
@@ -179,6 +179,40 @@ export async function getBenchmark(id: string): Promise {
   return client.benchmarks.retrieve(id);
 }
 
+/**
+ * List public benchmark definitions with pagination
+ */
+export async function listPublicBenchmarks(
+  options: ListBenchmarksOptions,
+): Promise<ListBenchmarksResult> {
+  const client = getClient();
+
+  const queryParams: {
+    limit?: number;
+    starting_after?: string;
+    search?: string;
+  } = {
+    limit: options.limit,
+  };
+
+  if (options.startingAfter) {
+    queryParams.starting_after = options.startingAfter;
+  }
+
+  if (options.search) {
+    queryParams.search = options.search;
+  }
+
+  const page = await client.benchmarks.listPublic(queryParams);
+  const benchmarks = page.benchmarks || [];
+
+  return {
+    benchmarks,
+    totalCount: benchmarks.length,
+    hasMore: page.has_more || false,
+  };
+}
+
 /**
  * Create/start a benchmark run with selected benchmarks
  */
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 07c67d0b..80038122 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1012,6 +1012,62 @@ export function createProgram(): Command {
       await installMcpConfig();
     });
 
+  // Benchmark job commands
+  const benchmarkJob = program
+    .command("benchmark-job")
+    .description("Manage benchmark jobs")
+    .alias("bmj");
+
+  benchmarkJob
+    .command("run")
+    .description("Run a benchmark job with an agent")
+    .requiredOption(
+      "--agent <agent>",
+      "Agent to use (claude-code, codex, opencode, goose, gemini-cli)",
+    )
+    .requiredOption("--model <model>", "Model name for the agent")
+    .option("--benchmark <id-or-name>", "Benchmark ID or name to run")
+    .option(
+      "--scenarios <ids...>",
+      "Scenario IDs to run (alternative to --benchmark)",
+    )
+    .option("-n, --job-name <name>", "Job name")
+    .option(
+      "--env-vars <vars...>",
+      "Additional environment variables (format: KEY=value)",
+    )
+    .option(
+      "--secrets <secrets...>",
+      "Secrets to inject as environment variables (format: ENV_VAR=SECRET_NAME)",
+    )
+    .option("--timeout <seconds>", "Agent timeout in seconds")
+    .option("--n-attempts <n>", "Number of attempts per scenario")
+    .option("--n-concurrent-trials <n>", "Number of concurrent trials")
+    .option("--timeout-multiplier <n>", "Timeout multiplier")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (options) => {
+      const { runBenchmarkJob } =
+        await import("../commands/benchmark-job/run.js");
+      await runBenchmarkJob(options);
+    });
+
+  benchmarkJob
+    .command("status <id>")
+    .description("Get benchmark job status and results")
+    .option("-w, --watch", "Watch for job to complete before showing results")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (id, options) => {
+      const { statusBenchmarkJob } =
+        await import("../commands/benchmark-job/status.js");
+      await statusBenchmarkJob(id, options);
+    });
+
   // Hidden command: 'rli mcp' without subcommand starts the server (for Claude Desktop config compatibility)
   program
     .command("mcp-server", { hidden: true })