diff --git a/README.md b/README.md index 86d1bb95..305d8d4f 100644 --- a/README.md +++ b/README.md @@ -184,7 +184,7 @@ rli mcp install # Install Runloop MCP server configurat ### Benchmark-job Commands (alias: `bmj`) ```bash -rli benchmark-job run # Run a benchmark job with an agent +rli benchmark-job run # Run a benchmark job with one or more ... rli benchmark-job status # Get benchmark job status and results ``` diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts index e0960841..b1debaaf 100644 --- a/src/commands/benchmark-job/run.ts +++ b/src/commands/benchmark-job/run.ts @@ -43,8 +43,7 @@ const SUPPORTED_AGENTS = { type SupportedAgent = keyof typeof SUPPORTED_AGENTS; interface RunOptions { - agent: string; - model: string; + agent?: string[]; benchmark?: string; scenarios?: string[]; jobName?: string; @@ -57,6 +56,50 @@ interface RunOptions { output?: string; } +interface ParsedAgent { + name: SupportedAgent; + model: string; +} + +// Parse agent strings in "agent:model" format +function parseAgentStrings(agentStrings: string[] | undefined): ParsedAgent[] { + if (!agentStrings || agentStrings.length === 0) { + throw new Error( + "At least one --agent is required. Format: --agent agent:model (e.g., --agent claude-code:claude-sonnet-4)", + ); + } + + const agents: ParsedAgent[] = []; + + for (const agentStr of agentStrings) { + const colonIndex = agentStr.indexOf(":"); + if (colonIndex === -1) { + throw new Error( + `Invalid agent format: "${agentStr}". Use format: agent:model (e.g., claude-code:claude-sonnet-4)`, + ); + } + + const agentName = agentStr.substring(0, colonIndex); + const model = agentStr.substring(colonIndex + 1); + + if (!model) { + throw new Error( + `No model specified for agent "${agentName}". Use format: --agent ${agentName}:model-name`, + ); + } + + // Validate agent + validateAgent(agentName); + + agents.push({ + name: agentName as SupportedAgent, + model, + }); + } + + return agents; +} + // Parse environment variables from KEY=value format function parseEnvVars(envVars: string[]): Record { const result: Record = {}; @@ -201,9 +244,8 @@ async function resolveBenchmarkId(benchmarkIdOrName: string): Promise { export async function runBenchmarkJob(options: RunOptions) { try { - // Validate agent - validateAgent(options.agent); - const agent = options.agent as SupportedAgent; + // Parse agent strings (format: agent:model) + const parsedAgents = parseAgentStrings(options.agent); // Parse provided env vars and secrets const providedEnvVars = options.envVars @@ -213,30 +255,37 @@ export async function runBenchmarkJob(options: RunOptions) { ? parseSecrets(options.secrets) : {}; - // Ensure agent secrets exist (auto-create from env vars if needed) - // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY) - const agentSecrets = await ensureAgentSecrets(agent); + // Get unique agent names for secret setup + const uniqueAgentNames = [...new Set(parsedAgents.map((a) => a.name))]; - // Validate that at least one secret is available (only if requiresAny is true) - const agentConfig = SUPPORTED_AGENTS[agent]; - if (agentConfig.requiresAny) { - const hasAny = agentConfig.automaticEnvVars.some( - (varName) => agentSecrets[varName], - ); - if (!hasAny) { - throw new Error( - `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` + - `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` + - `or set environment variables.`, + // Ensure secrets exist for all unique agents + // Collect all secrets across agents + const allAgentSecrets: Record = {}; + for (const agentName of uniqueAgentNames) { + const agentSecrets = await ensureAgentSecrets(agentName); + + // Validate that at least one secret is available (only if requiresAny is true) + const agentConfig = SUPPORTED_AGENTS[agentName]; + if (agentConfig.requiresAny) { + const hasAny = agentConfig.automaticEnvVars.some( + (varName) => agentSecrets[varName], ); + if (!hasAny) { + throw new Error( + `Agent ${agentName} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` + + `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` + + `or set environment variables.`, + ); + } } + + // Merge secrets (later agents can use same secrets) + Object.assign(allAgentSecrets, agentSecrets); } - // If requiresAny is false, we just use whatever secrets were auto-populated - // User may be configuring credentials via other means (e.g., --secrets flag) // Combine agent secrets with user-provided secrets const secrets = { - ...agentSecrets, + ...allAgentSecrets, ...providedSecrets, }; @@ -266,25 +315,22 @@ export async function runBenchmarkJob(options: RunOptions) { quiet: false, }; + // Build agent configs for all parsed agents + const agentConfigs = parsedAgents.map((agent) => ({ + name: agent.name, + modelName: agent.model, + timeoutSeconds: options.timeout ? parseInt(options.timeout, 10) : 1800, + environmentVariables: + Object.keys(providedEnvVars).length > 0 ? providedEnvVars : undefined, + secrets, + })); + // Create the benchmark job const job = await createBenchmarkJob({ name: options.jobName, benchmarkId, scenarioIds: options.scenarios, - agentConfigs: [ - { - name: agent, - modelName: options.model, - timeoutSeconds: options.timeout - ? parseInt(options.timeout, 10) - : 1800, - environmentVariables: - Object.keys(providedEnvVars).length > 0 - ? providedEnvVars - : undefined, - secrets, - }, - ], + agentConfigs, orchestratorConfig, }); diff --git a/src/utils/commands.ts b/src/utils/commands.ts index 80038122..83fc1c25 100644 --- a/src/utils/commands.ts +++ b/src/utils/commands.ts @@ -1020,12 +1020,11 @@ export function createProgram(): Command { benchmarkJob .command("run") - .description("Run a benchmark job with an agent") - .requiredOption( - "--agent ", - "Agent to use (claude-code, codex, opencode, goose, gemini-cli)", + .description("Run a benchmark job with one or more agents") + .option( + "--agent ", + "Agent(s) to use. Format: agent:model (e.g., claude-code:claude-sonnet-4). Can specify multiple.", ) - .requiredOption("--model ", "Model name for the agent") .option("--benchmark ", "Benchmark ID or name to run") .option( "--scenarios ",