From 3868d961c41cdf01ac4de2c0fbed46b5d1b42d29 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 11:18:52 -0800 Subject: [PATCH 1/6] add benchmark job cli --- src/commands/benchmark-job/run.ts | 264 ++++++++++++++++++++++++ src/commands/benchmark-job/status.ts | 295 +++++++++++++++++++++++++++ src/utils/commands.ts | 58 ++++++ 3 files changed, 617 insertions(+) create mode 100644 src/commands/benchmark-job/run.ts create mode 100644 src/commands/benchmark-job/status.ts diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts new file mode 100644 index 00000000..99f7d466 --- /dev/null +++ b/src/commands/benchmark-job/run.ts @@ -0,0 +1,264 @@ +/** + * Run benchmark job command + */ + +import { createBenchmarkJob } from "../../services/benchmarkJobService.js"; +import { listBenchmarks } from "../../services/benchmarkService.js"; +import { output, outputError } from "../../utils/output.js"; + +// Supported agents and their required environment variables +const SUPPORTED_AGENTS = { + "claude-code": { + requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"], + requiresAny: true, // At least one of these is required + }, + codex: { + requiredEnvVars: ["OPENAI_API_KEY"], + requiresAny: false, + }, + opencode: { + requiredEnvVars: ["ANTHROPIC_API_KEY"], + requiresAny: false, + }, + goose: { + requiredEnvVars: ["ANTHROPIC_API_KEY"], + requiresAny: false, + }, + "gemini-cli": { + requiredEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"], + requiresAny: true, // At least one of these is required + }, +} as const; + +type SupportedAgent = keyof typeof SUPPORTED_AGENTS; + +interface RunOptions { + agent: string; + model: string; + benchmark?: string; + scenarios?: string[]; + jobName?: string; + envVars?: string[]; + secrets?: string[]; + timeout?: string; + nAttempts?: string; + nConcurrentTrials?: string; + timeoutMultiplier?: string; + output?: string; +} + +// Parse environment variables from KEY=value format +function parseEnvVars(envVars: string[]): Record { + const result: Record = {}; + for (const envVar of envVars) { + const eqIndex = envVar.indexOf("="); + if (eqIndex === -1) { + throw new Error( + `Invalid environment variable format: ${envVar}. Expected KEY=value`, + ); + } + const key = envVar.substring(0, eqIndex); + const value = envVar.substring(eqIndex + 1); + result[key] = value; + } + return result; +} + +// Parse secrets from ENV_VAR=SECRET_NAME format +function parseSecrets(secrets: string[]): Record { + const result: Record = {}; + for (const secret of secrets) { + const eqIndex = secret.indexOf("="); + if (eqIndex === -1) { + throw new Error( + `Invalid secret format: ${secret}. Expected ENV_VAR=SECRET_NAME`, + ); + } + const envVarName = secret.substring(0, eqIndex); + const secretName = secret.substring(eqIndex + 1); + result[envVarName] = secretName; + } + return result; +} + +// Validate agent is supported +function validateAgent(agent: string): asserts agent is SupportedAgent { + if (!(agent in SUPPORTED_AGENTS)) { + const supportedList = Object.keys(SUPPORTED_AGENTS).join(", "); + throw new Error( + `Unsupported agent: ${agent}. Supported agents: ${supportedList}`, + ); + } +} + +// Get env vars from current environment for the agent +function getAgentEnvVars(agent: SupportedAgent): Record { + const agentConfig = SUPPORTED_AGENTS[agent]; + const envVars: Record = {}; + + for (const varName of agentConfig.requiredEnvVars) { + const value = process.env[varName]; + if (value) { + envVars[varName] = value; + } + } + + return envVars; +} + +// Validate that required env vars are present +function validateEnvVars( + agent: SupportedAgent, + providedEnvVars: Record, +): void { + const agentConfig = SUPPORTED_AGENTS[agent]; + const allEnvVars = { ...getAgentEnvVars(agent), ...providedEnvVars }; + + if (agentConfig.requiresAny) { + // At least one of the required env vars must be present + const hasAny = agentConfig.requiredEnvVars.some( + (varName) => allEnvVars[varName], + ); + if (!hasAny) { + throw new Error( + `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + + `Set via --env-vars or as environment variables.`, + ); + } + } else { + // For agents that don't use requiresAny, we just need at least one key + // since different models may need different keys + const hasAny = agentConfig.requiredEnvVars.some( + (varName) => allEnvVars[varName], + ); + if (!hasAny) { + throw new Error( + `Agent ${agent} requires environment variables. Expected one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + + `Set via --env-vars or as environment variables.`, + ); + } + } +} + +// Resolve benchmark name to ID if needed +async function resolveBenchmarkId(benchmarkIdOrName: string): Promise { + // If it looks like an ID (starts with bm_ or similar), return as-is + if ( + benchmarkIdOrName.startsWith("bm_") || + benchmarkIdOrName.startsWith("bmk_") + ) { + return benchmarkIdOrName; + } + + // Otherwise, search for benchmark by name + const result = await listBenchmarks({ + limit: 100, + search: benchmarkIdOrName, + }); + + // Look for exact name match + const exactMatch = result.benchmarks.find( + (b) => b.name === benchmarkIdOrName, + ); + + if (exactMatch) { + return exactMatch.id; + } + + if (result.benchmarks.length === 0) { + throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`); + } + + // If no exact match but we have results, suggest them + const suggestions = result.benchmarks + .slice(0, 5) + .map((b) => ` - ${b.name} (${b.id})`) + .join("\n"); + throw new Error( + `No exact match for benchmark "${benchmarkIdOrName}". Did you mean:\n${suggestions}`, + ); +} + +export async function runBenchmarkJob(options: RunOptions) { + try { + // Validate agent + validateAgent(options.agent); + const agent = options.agent as SupportedAgent; + + // Parse provided env vars and secrets + const providedEnvVars = options.envVars + ? parseEnvVars(options.envVars) + : {}; + const providedSecrets = options.secrets + ? parseSecrets(options.secrets) + : {}; + + // Merge environment variables (CLI-provided override auto-detected) + const environmentVariables = { + ...getAgentEnvVars(agent), + ...providedEnvVars, + }; + + // Validate required env vars + validateEnvVars(agent, providedEnvVars); + + // Validate that either benchmark or scenarios is provided, but not both + if (!options.benchmark && !options.scenarios) { + throw new Error( + "Either --benchmark or --scenarios must be specified", + ); + } + if (options.benchmark && options.scenarios) { + throw new Error("Cannot specify both --benchmark and --scenarios"); + } + + // Resolve benchmark ID if name was provided + let benchmarkId: string | undefined; + if (options.benchmark) { + benchmarkId = await resolveBenchmarkId(options.benchmark); + } + + // Build orchestrator config with defaults + const orchestratorConfig = { + nConcurrentTrials: options.nConcurrentTrials + ? parseInt(options.nConcurrentTrials, 10) + : 10, + nAttempts: options.nAttempts ? parseInt(options.nAttempts, 10) : 1, + timeoutMultiplier: options.timeoutMultiplier + ? parseFloat(options.timeoutMultiplier) + : 1.0, + quiet: false, + }; + + // Create the benchmark job + const job = await createBenchmarkJob({ + name: options.jobName, + benchmarkId, + scenarioIds: options.scenarios, + agentConfigs: [ + { + name: agent, + modelName: options.model, + timeoutSeconds: options.timeout + ? parseInt(options.timeout, 10) + : 1800, + environmentVariables, + secrets: + Object.keys(providedSecrets).length > 0 + ? providedSecrets + : undefined, + }, + ], + orchestratorConfig, + }); + + // Output result + if (!options.output || options.output === "text") { + console.log(job.id); + } else { + output(job, { format: options.output, defaultFormat: "json" }); + } + } catch (error) { + outputError("Failed to run benchmark job", error); + } +} diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts new file mode 100644 index 00000000..fa55ffc4 --- /dev/null +++ b/src/commands/benchmark-job/status.ts @@ -0,0 +1,295 @@ +/** + * Status benchmark job command + */ + +import chalk from "chalk"; +import { getBenchmarkJob } from "../../services/benchmarkJobService.js"; +import { output, outputError } from "../../utils/output.js"; + +interface StatusOptions { + wait?: boolean; + output?: string; +} + +// Job states that indicate completion +const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"]; + +// Polling config +const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds +const MAX_WAIT_MS = 60 * 60 * 1000; // 1 hour + +// Sleep utility +function sleep(ms: number): Promise { + return new Promise((resolve) => setTimeout(resolve, ms)); +} + +interface ScenarioOutcome { + scenario_name?: string; + scenario_definition_id?: string; + state?: string; + score?: number; +} + +interface BenchmarkOutcome { + agent_name?: string; + model_name?: string; + scenario_outcomes?: ScenarioOutcome[]; +} + +interface JobData { + id: string; + name?: string; + state?: string; + benchmark_outcomes?: BenchmarkOutcome[]; +} + +// Calculate stats for scenario outcomes +function calculateStats(outcomes: ScenarioOutcome[]): { + total: number; + passed: number; + failedZero: number; + failedError: number; +} { + let passed = 0; + let failedZero = 0; + let failedError = 0; + + for (const outcome of outcomes) { + const state = outcome.state?.toUpperCase(); + const score = outcome.score; + + if (state === "COMPLETED") { + if (score === 1.0) { + passed++; + } else { + failedZero++; + } + } else { + // Any non-COMPLETED state is an error + failedError++; + } + } + + return { + total: outcomes.length, + passed, + failedZero, + failedError, + }; +} + +// Format percentage +function formatPercent(count: number, total: number): string { + if (total === 0) return "0.0%"; + return ((count / total) * 100).toFixed(1) + "%"; +} + +// Print current status (brief) +function printStatus(job: JobData): void { + const jobName = job.name || job.id; + const state = job.state || "unknown"; + + console.log(`Job: ${jobName}`); + console.log(`ID: ${job.id}`); + console.log(`State: ${state}`); + + if (COMPLETED_STATES.includes(state)) { + const outcomes = job.benchmark_outcomes || []; + if (outcomes.length > 0) { + let totalScenarios = 0; + let totalPassed = 0; + for (const outcome of outcomes) { + const stats = calculateStats(outcome.scenario_outcomes || []); + totalScenarios += stats.total; + totalPassed += stats.passed; + } + if (totalScenarios > 0) { + console.log( + `Results: ${totalPassed}/${totalScenarios} passed (${formatPercent(totalPassed, totalScenarios)})`, + ); + } + } + } +} + +// Print results table +function printResultsTable(job: JobData): void { + const outcomes = job.benchmark_outcomes || []; + + if (outcomes.length === 0) { + console.log(chalk.yellow("No benchmark outcomes found")); + return; + } + + // Header + console.log(); + console.log(chalk.bold("Benchmark Job Results")); + console.log(chalk.dim(`Job ID: ${job.id}`)); + if (job.name) { + console.log(chalk.dim(`Name: ${job.name}`)); + } + console.log(chalk.dim(`State: ${job.state}`)); + console.log(); + + // Table header + const agentCol = "Agent / Model".padEnd(40); + const passedCol = "Passed".padStart(10); + const failedCol = "Failed (0.0)".padStart(14); + const errorCol = "Failed (error)".padStart(16); + const totalCol = "Total".padStart(8); + + console.log( + chalk.bold(agentCol + passedCol + failedCol + errorCol + totalCol), + ); + console.log(chalk.dim("-".repeat(88))); + + // Print each agent's results + for (const outcome of outcomes) { + const agentName = outcome.agent_name || "unknown"; + const modelName = outcome.model_name || "default"; + const scenarioOutcomes = outcome.scenario_outcomes || []; + + const stats = calculateStats(scenarioOutcomes); + + // Format agent/model column + let agentModelStr = agentName; + if (modelName && modelName !== "default") { + agentModelStr += ` (${modelName})`; + } + if (agentModelStr.length > 38) { + agentModelStr = agentModelStr.slice(0, 35) + "..."; + } + const agentModelCol = agentModelStr.padEnd(40); + + // Format stats columns with colors + const passedStr = formatPercent(stats.passed, stats.total); + const failedZeroStr = formatPercent(stats.failedZero, stats.total); + const failedErrorStr = formatPercent(stats.failedError, stats.total); + + const passedColored = + stats.passed > 0 + ? chalk.green(passedStr.padStart(10)) + : chalk.dim(passedStr.padStart(10)); + + const failedZeroColored = + stats.failedZero > 0 + ? chalk.yellow(failedZeroStr.padStart(14)) + : chalk.dim(failedZeroStr.padStart(14)); + + const failedErrorColored = + stats.failedError > 0 + ? chalk.red(failedErrorStr.padStart(16)) + : chalk.dim(failedErrorStr.padStart(16)); + + const totalColStr = String(stats.total).padStart(8); + + console.log( + agentModelCol + + passedColored + + failedZeroColored + + failedErrorColored + + chalk.dim(totalColStr), + ); + + // Print individual scenario results underneath (indented) + for (const scenario of scenarioOutcomes) { + const scenarioName = + scenario.scenario_name || scenario.scenario_definition_id || "unknown"; + const state = scenario.state || "unknown"; + const score = scenario.score; + + let statusIcon: string; + let statusColor: typeof chalk.green; + + if (state.toUpperCase() === "COMPLETED") { + if (score === 1.0) { + statusIcon = chalk.green("\u2713"); // checkmark + statusColor = chalk.green; + } else { + statusIcon = chalk.yellow("\u2717"); // X + statusColor = chalk.yellow; + } + } else { + statusIcon = chalk.red("!"); + statusColor = chalk.red; + } + + const scenarioNameTrunc = + scenarioName.length > 50 + ? scenarioName.slice(0, 47) + "..." + : scenarioName; + + const scoreStr = + score !== undefined ? `score=${score.toFixed(1)}` : state; + + console.log( + chalk.dim(" ") + + statusIcon + + " " + + chalk.dim(scenarioNameTrunc.padEnd(52)) + + statusColor(scoreStr), + ); + } + } + + console.log(); +} + +export async function statusBenchmarkJob( + id: string, + options: StatusOptions = {}, +) { + try { + // Initial fetch + let job = (await getBenchmarkJob(id)) as unknown as JobData; + + // Check if job is complete + const isComplete = COMPLETED_STATES.includes(job.state || ""); + + // If not waiting or already complete, just print status/results + if (!options.wait || isComplete) { + if (options.output && options.output !== "text") { + output(job, { format: options.output, defaultFormat: "json" }); + } else if (isComplete) { + printResultsTable(job); + } else { + printStatus(job); + } + return; + } + + // Wait mode: poll until complete + const jobName = job.name || job.id; + console.log(chalk.cyan(`Awaiting job "${jobName}" completion...`)); + console.log(chalk.dim(`Current state: ${job.state}`)); + console.log(); + + const startTime = Date.now(); + + while (!COMPLETED_STATES.includes(job.state || "")) { + // Check timeout + if (Date.now() - startTime > MAX_WAIT_MS) { + console.log(); + outputError( + `Timeout waiting for job completion after ${MAX_WAIT_MS / 1000 / 60} minutes`, + ); + } + + await sleep(POLL_INTERVAL_MS); + job = (await getBenchmarkJob(id)) as unknown as JobData; + process.stdout.write(chalk.dim(".")); + } + + console.log(); + console.log(); + + // Output based on format + if (options.output && options.output !== "text") { + output(job, { format: options.output, defaultFormat: "json" }); + } else { + printResultsTable(job); + } + } catch (error) { + outputError("Failed to get benchmark job status", error); + } +} diff --git a/src/utils/commands.ts b/src/utils/commands.ts index 07c67d0b..e6bbfc79 100644 --- a/src/utils/commands.ts +++ b/src/utils/commands.ts @@ -1012,6 +1012,64 @@ export function createProgram(): Command { await installMcpConfig(); }); + // Benchmark job commands + const benchmarkJob = program + .command("benchmark-job") + .description("Manage benchmark jobs") + .alias("bmj"); + + benchmarkJob + .command("run") + .description("Run a benchmark job with an agent") + .requiredOption( + "--agent ", + "Agent to use (claude-code, codex, opencode, goose, gemini-cli)", + ) + .requiredOption("--model ", "Model name for the agent") + .option("--benchmark ", "Benchmark ID or name to run") + .option( + "--scenarios ", + "Scenario IDs to run (alternative to --benchmark)", + ) + .option("-n, --job-name ", "Job name") + .option( + "--env-vars ", + "Environment variables (format: KEY=value). Agent-specific API keys are auto-detected from environment.", + ) + .option( + "--secrets ", + "Secrets to inject as environment variables (format: ENV_VAR=SECRET_NAME)", + ) + .option("--timeout ", "Agent timeout in seconds") + .option("--n-attempts ", "Number of attempts per scenario") + .option("--n-concurrent-trials ", "Number of concurrent trials") + .option("--timeout-multiplier ", "Timeout multiplier") + .option( + "-o, --output [format]", + "Output format: text|json|yaml (default: text)", + ) + .action(async (options) => { + const { runBenchmarkJob } = await import( + "../commands/benchmark-job/run.js" + ); + await runBenchmarkJob(options); + }); + + benchmarkJob + .command("status ") + .description("Get benchmark job status and results") + .option("-w, --wait", "Wait for job to complete before showing results") + .option( + "-o, --output [format]", + "Output format: text|json|yaml (default: text)", + ) + .action(async (id, options) => { + const { statusBenchmarkJob } = await import( + "../commands/benchmark-job/status.js" + ); + await statusBenchmarkJob(id, options); + }); + // Hidden command: 'rli mcp' without subcommand starts the server (for Claude Desktop config compatibility) program .command("mcp-server", { hidden: true }) From e0d2419ffe12ac6d45ee42ea78f4f5d648a18e97 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 12:33:51 -0800 Subject: [PATCH 2/6] cp --- src/commands/benchmark-job/run.ts | 167 +++++++++++++++++++----------- src/services/benchmarkService.ts | 34 ++++++ src/utils/commands.ts | 2 +- 3 files changed, 139 insertions(+), 64 deletions(-) diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts index 99f7d466..3982e9e1 100644 --- a/src/commands/benchmark-job/run.ts +++ b/src/commands/benchmark-job/run.ts @@ -2,11 +2,19 @@ * Run benchmark job command */ +import chalk from "chalk"; import { createBenchmarkJob } from "../../services/benchmarkJobService.js"; -import { listBenchmarks } from "../../services/benchmarkService.js"; +import { + listBenchmarks, + listPublicBenchmarks, +} from "../../services/benchmarkService.js"; +import { getClient } from "../../utils/client.js"; import { output, outputError } from "../../utils/output.js"; -// Supported agents and their required environment variables +// Secret name prefix for benchmark job secrets +const SECRET_PREFIX = "BMJ_"; + +// Supported agents and their required environment variables (mapped to BMJ_* secrets) const SUPPORTED_AGENTS = { "claude-code": { requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"], @@ -91,53 +99,55 @@ function validateAgent(agent: string): asserts agent is SupportedAgent { } } -// Get env vars from current environment for the agent -function getAgentEnvVars(agent: SupportedAgent): Record { - const agentConfig = SUPPORTED_AGENTS[agent]; - const envVars: Record = {}; - - for (const varName of agentConfig.requiredEnvVars) { - const value = process.env[varName]; - if (value) { - envVars[varName] = value; - } - } +// Check if a secret exists by name +async function secretExists(secretName: string): Promise { + const client = getClient(); + const result = await client.secrets.list({ limit: 5000 }); + return result.secrets?.some((s) => s.name === secretName) ?? false; +} - return envVars; +// Create a secret +async function createSecret(name: string, value: string): Promise { + const client = getClient(); + await client.secrets.create({ name, value }); } -// Validate that required env vars are present -function validateEnvVars( +// Ensure agent secrets exist, creating them from env vars if needed +// Returns the secrets mapping (ENV_VAR -> BMJ_ENV_VAR) +async function ensureAgentSecrets( agent: SupportedAgent, - providedEnvVars: Record, -): void { +): Promise> { const agentConfig = SUPPORTED_AGENTS[agent]; - const allEnvVars = { ...getAgentEnvVars(agent), ...providedEnvVars }; + const secrets: Record = {}; - if (agentConfig.requiresAny) { - // At least one of the required env vars must be present - const hasAny = agentConfig.requiredEnvVars.some( - (varName) => allEnvVars[varName], - ); - if (!hasAny) { - throw new Error( - `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + - `Set via --env-vars or as environment variables.`, + for (const varName of agentConfig.requiredEnvVars) { + const secretName = `${SECRET_PREFIX}${varName}`; + const envValue = process.env[varName]; + + // Check if secret exists + const exists = await secretExists(secretName); + + if (exists) { + console.log(chalk.dim(`Secret ${secretName} exists`)); + secrets[varName] = secretName; + } else if (envValue) { + // Create secret from env var + console.log( + chalk.cyan(`Creating secret ${secretName} from ${varName} env var`), ); - } - } else { - // For agents that don't use requiresAny, we just need at least one key - // since different models may need different keys - const hasAny = agentConfig.requiredEnvVars.some( - (varName) => allEnvVars[varName], - ); - if (!hasAny) { - throw new Error( - `Agent ${agent} requires environment variables. Expected one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + - `Set via --env-vars or as environment variables.`, + await createSecret(secretName, envValue); + secrets[varName] = secretName; + } else { + // No secret and no env var - skip (will be validated later if required) + console.log( + chalk.yellow( + `Secret ${secretName} not found and ${varName} not set in environment`, + ), ); } } + + return secrets; } // Resolve benchmark name to ID if needed @@ -150,27 +160,34 @@ async function resolveBenchmarkId(benchmarkIdOrName: string): Promise { return benchmarkIdOrName; } - // Otherwise, search for benchmark by name - const result = await listBenchmarks({ - limit: 100, - search: benchmarkIdOrName, - }); + // Search both user benchmarks and public benchmarks + const [userResult, publicResult] = await Promise.all([ + listBenchmarks({ + limit: 100, + search: benchmarkIdOrName, + }), + listPublicBenchmarks({ + limit: 100, + search: benchmarkIdOrName, + }), + ]); + + // Combine results + const allBenchmarks = [...userResult.benchmarks, ...publicResult.benchmarks]; // Look for exact name match - const exactMatch = result.benchmarks.find( - (b) => b.name === benchmarkIdOrName, - ); + const exactMatch = allBenchmarks.find((b) => b.name === benchmarkIdOrName); if (exactMatch) { return exactMatch.id; } - if (result.benchmarks.length === 0) { + if (allBenchmarks.length === 0) { throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`); } // If no exact match but we have results, suggest them - const suggestions = result.benchmarks + const suggestions = allBenchmarks .slice(0, 5) .map((b) => ` - ${b.name} (${b.id})`) .join("\n"); @@ -193,20 +210,44 @@ export async function runBenchmarkJob(options: RunOptions) { ? parseSecrets(options.secrets) : {}; - // Merge environment variables (CLI-provided override auto-detected) - const environmentVariables = { - ...getAgentEnvVars(agent), - ...providedEnvVars, - }; + // Ensure agent secrets exist (auto-create from env vars if needed) + // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY) + const agentSecrets = await ensureAgentSecrets(agent); + + // Validate that at least one required secret is available + const agentConfig = SUPPORTED_AGENTS[agent]; + if (agentConfig.requiresAny) { + const hasAny = agentConfig.requiredEnvVars.some( + (varName) => agentSecrets[varName], + ); + if (!hasAny) { + throw new Error( + `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + + `Create secrets (${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` + + `or set environment variables.`, + ); + } + } else { + const hasAny = agentConfig.requiredEnvVars.some( + (varName) => agentSecrets[varName], + ); + if (!hasAny) { + throw new Error( + `Agent ${agent} requires secrets. Expected one of: ${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}. ` + + `Create secrets or set environment variables.`, + ); + } + } - // Validate required env vars - validateEnvVars(agent, providedEnvVars); + // Combine agent secrets with user-provided secrets + const secrets = { + ...agentSecrets, + ...providedSecrets, + }; // Validate that either benchmark or scenarios is provided, but not both if (!options.benchmark && !options.scenarios) { - throw new Error( - "Either --benchmark or --scenarios must be specified", - ); + throw new Error("Either --benchmark or --scenarios must be specified"); } if (options.benchmark && options.scenarios) { throw new Error("Cannot specify both --benchmark and --scenarios"); @@ -242,11 +283,11 @@ export async function runBenchmarkJob(options: RunOptions) { timeoutSeconds: options.timeout ? parseInt(options.timeout, 10) : 1800, - environmentVariables, - secrets: - Object.keys(providedSecrets).length > 0 - ? providedSecrets + environmentVariables: + Object.keys(providedEnvVars).length > 0 + ? providedEnvVars : undefined, + secrets, }, ], orchestratorConfig, diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts index 17a164cd..e6373464 100644 --- a/src/services/benchmarkService.ts +++ b/src/services/benchmarkService.ts @@ -179,6 +179,40 @@ export async function getBenchmark(id: string): Promise { return client.benchmarks.retrieve(id); } +/** + * List public benchmark definitions with pagination + */ +export async function listPublicBenchmarks( + options: ListBenchmarksOptions, +): Promise { + const client = getClient(); + + const queryParams: { + limit?: number; + starting_after?: string; + search?: string; + } = { + limit: options.limit, + }; + + if (options.startingAfter) { + queryParams.starting_after = options.startingAfter; + } + + if (options.search) { + queryParams.search = options.search; + } + + const page = await client.benchmarks.listPublic(queryParams); + const benchmarks = page.benchmarks || []; + + return { + benchmarks, + totalCount: benchmarks.length, + hasMore: page.has_more || false, + }; +} + /** * Create/start a benchmark run with selected benchmarks */ diff --git a/src/utils/commands.ts b/src/utils/commands.ts index e6bbfc79..31439fe4 100644 --- a/src/utils/commands.ts +++ b/src/utils/commands.ts @@ -1034,7 +1034,7 @@ export function createProgram(): Command { .option("-n, --job-name ", "Job name") .option( "--env-vars ", - "Environment variables (format: KEY=value). Agent-specific API keys are auto-detected from environment.", + "Additional environment variables (format: KEY=value)", ) .option( "--secrets ", From 7bcb047b26799956ac9c5ff36bcf71640885d0a1 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 12:34:36 -0800 Subject: [PATCH 3/6] cp --- src/utils/commands.ts | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/src/utils/commands.ts b/src/utils/commands.ts index 31439fe4..8f922289 100644 --- a/src/utils/commands.ts +++ b/src/utils/commands.ts @@ -1049,9 +1049,8 @@ export function createProgram(): Command { "Output format: text|json|yaml (default: text)", ) .action(async (options) => { - const { runBenchmarkJob } = await import( - "../commands/benchmark-job/run.js" - ); + const { runBenchmarkJob } = + await import("../commands/benchmark-job/run.js"); await runBenchmarkJob(options); }); @@ -1064,9 +1063,8 @@ export function createProgram(): Command { "Output format: text|json|yaml (default: text)", ) .action(async (id, options) => { - const { statusBenchmarkJob } = await import( - "../commands/benchmark-job/status.js" - ); + const { statusBenchmarkJob } = + await import("../commands/benchmark-job/status.js"); await statusBenchmarkJob(id, options); }); From 40900430598079c511c2d142a6aefefc4757b080 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 12:34:56 -0800 Subject: [PATCH 4/6] cp --- README.md | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/README.md b/README.md index 9f833a11..86d1bb95 100644 --- a/README.md +++ b/README.md @@ -181,6 +181,13 @@ rli mcp start # Start the MCP server rli mcp install # Install Runloop MCP server configurat... ``` +### Benchmark-job Commands (alias: `bmj`) + +```bash +rli benchmark-job run # Run a benchmark job with an agent +rli benchmark-job status # Get benchmark job status and results +``` + ## MCP Server (AI Integration) From 523b89c9567a9d6d96dbc5d8b630ca633ec710a1 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 12:41:45 -0800 Subject: [PATCH 5/6] cp --- src/commands/benchmark-job/run.ts | 42 +++++++++++++------------------ 1 file changed, 18 insertions(+), 24 deletions(-) diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts index 3982e9e1..79e426bf 100644 --- a/src/commands/benchmark-job/run.ts +++ b/src/commands/benchmark-job/run.ts @@ -14,26 +14,28 @@ import { output, outputError } from "../../utils/output.js"; // Secret name prefix for benchmark job secrets const SECRET_PREFIX = "BMJ_"; -// Supported agents and their required environment variables (mapped to BMJ_* secrets) +// Supported agents and their automatic environment variables (mapped to BMJ_* secrets) +// - automaticEnvVars: env vars that will be auto-populated from secrets or environment +// - requiresAny: if true, at least one must be set; if false, just try to auto-populate const SUPPORTED_AGENTS = { "claude-code": { - requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"], + automaticEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"], requiresAny: true, // At least one of these is required }, codex: { - requiredEnvVars: ["OPENAI_API_KEY"], - requiresAny: false, + automaticEnvVars: ["OPENAI_API_KEY"], + requiresAny: true, }, opencode: { - requiredEnvVars: ["ANTHROPIC_API_KEY"], - requiresAny: false, + automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"], + requiresAny: false, // Try to auto-populate, but user may configure differently }, goose: { - requiredEnvVars: ["ANTHROPIC_API_KEY"], - requiresAny: false, + automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"], + requiresAny: false, // Try to auto-populate, but user may configure differently }, "gemini-cli": { - requiredEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"], + automaticEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"], requiresAny: true, // At least one of these is required }, } as const; @@ -120,7 +122,7 @@ async function ensureAgentSecrets( const agentConfig = SUPPORTED_AGENTS[agent]; const secrets: Record = {}; - for (const varName of agentConfig.requiredEnvVars) { + for (const varName of agentConfig.automaticEnvVars) { const secretName = `${SECRET_PREFIX}${varName}`; const envValue = process.env[varName]; @@ -214,30 +216,22 @@ export async function runBenchmarkJob(options: RunOptions) { // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY) const agentSecrets = await ensureAgentSecrets(agent); - // Validate that at least one required secret is available + // Validate that at least one secret is available (only if requiresAny is true) const agentConfig = SUPPORTED_AGENTS[agent]; if (agentConfig.requiresAny) { - const hasAny = agentConfig.requiredEnvVars.some( + const hasAny = agentConfig.automaticEnvVars.some( (varName) => agentSecrets[varName], ); if (!hasAny) { throw new Error( - `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` + - `Create secrets (${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` + + `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` + + `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` + `or set environment variables.`, ); } - } else { - const hasAny = agentConfig.requiredEnvVars.some( - (varName) => agentSecrets[varName], - ); - if (!hasAny) { - throw new Error( - `Agent ${agent} requires secrets. Expected one of: ${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}. ` + - `Create secrets or set environment variables.`, - ); - } } + // If requiresAny is false, we just use whatever secrets were auto-populated + // User may be configuring credentials via other means (e.g., --secrets flag) // Combine agent secrets with user-provided secrets const secrets = { From 3e364c202e5811dc4eb97cfbe3b07fa5a106ab63 Mon Sep 17 00:00:00 2001 From: Ross Date: Thu, 5 Mar 2026 14:02:55 -0800 Subject: [PATCH 6/6] pr feedback --- src/commands/benchmark-job/run.ts | 1 + src/commands/benchmark-job/status.ts | 6 +++--- src/utils/commands.ts | 2 +- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts index 79e426bf..e0960841 100644 --- a/src/commands/benchmark-job/run.ts +++ b/src/commands/benchmark-job/run.ts @@ -104,6 +104,7 @@ function validateAgent(agent: string): asserts agent is SupportedAgent { // Check if a secret exists by name async function secretExists(secretName: string): Promise { const client = getClient(); + // TODO: Fetch by name when API exposed. const result = await client.secrets.list({ limit: 5000 }); return result.secrets?.some((s) => s.name === secretName) ?? false; } diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts index fa55ffc4..c65bef8a 100644 --- a/src/commands/benchmark-job/status.ts +++ b/src/commands/benchmark-job/status.ts @@ -7,7 +7,7 @@ import { getBenchmarkJob } from "../../services/benchmarkJobService.js"; import { output, outputError } from "../../utils/output.js"; interface StatusOptions { - wait?: boolean; + watch?: boolean; output?: string; } @@ -16,7 +16,7 @@ const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"]; // Polling config const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds -const MAX_WAIT_MS = 60 * 60 * 1000; // 1 hour +const MAX_WAIT_MS = 60 * 60 * 4 * 1000; // 4 hours // Sleep utility function sleep(ms: number): Promise { @@ -247,7 +247,7 @@ export async function statusBenchmarkJob( const isComplete = COMPLETED_STATES.includes(job.state || ""); // If not waiting or already complete, just print status/results - if (!options.wait || isComplete) { + if (!options.watch || isComplete) { if (options.output && options.output !== "text") { output(job, { format: options.output, defaultFormat: "json" }); } else if (isComplete) { diff --git a/src/utils/commands.ts b/src/utils/commands.ts index 8f922289..80038122 100644 --- a/src/utils/commands.ts +++ b/src/utils/commands.ts @@ -1057,7 +1057,7 @@ export function createProgram(): Command { benchmarkJob .command("status ") .description("Get benchmark job status and results") - .option("-w, --wait", "Wait for job to complete before showing results") + .option("-w, --watch", "Watch for job to complete before showing results") .option( "-o, --output [format]", "Output format: text|json|yaml (default: text)",