runloopai · jrvb-rl · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -184,7 +184,7 @@ rli mcp install                          # Install Runloop MCP server configurat
 ### Benchmark-job Commands (alias: `bmj`)
 
 ```bash
-rli benchmark-job run                    # Run a benchmark job with an agent
+rli benchmark-job run                    # Run a benchmark job with one or more agents
 rli benchmark-job status <id>            # Get benchmark job status and results
 ```
 

diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
@@ -43,8 +43,7 @@ const SUPPORTED_AGENTS = {
 type SupportedAgent = keyof typeof SUPPORTED_AGENTS;
 
 interface RunOptions {
-  agent: string;
-  model: string;
+  agent?: string[];
   benchmark?: string;
   scenarios?: string[];
   jobName?: string;
@@ -57,6 +56,50 @@ interface RunOptions {
   output?: string;
 }
 
+interface ParsedAgent { // one --agent value split into its agent and model parts
+  name: SupportedAgent; // a key of SUPPORTED_AGENTS; parseAgentStrings takes it from the text before the first ":"
+  model: string; // parseAgentStrings takes it from the text after the first ":" and rejects an empty value
+}
+
+// Parse --agent values of the form "agent:model" into ParsedAgent entries (input order kept, duplicates kept); throws Error on a missing list, a value without ":", or an empty model
+function parseAgentStrings(agentStrings: string[] | undefined): ParsedAgent[] {
+  if (!agentStrings || agentStrings.length === 0) {
+    throw new Error(
+      "At least one --agent is required. Format: --agent agent:model (e.g., --agent claude-code:claude-sonnet-4)",
+    );
+  }
+
+  const agents: ParsedAgent[] = [];
+
+  for (const agentStr of agentStrings) {
+    const colonIndex = agentStr.indexOf(":"); // first ":" only, so the model part may itself contain ":"
+    if (colonIndex === -1) {
+      throw new Error(
+        `Invalid agent format: "${agentStr}". Use format: agent:model (e.g., claude-code:claude-sonnet-4)`,
+      );
+    }
+
+    const agentName = agentStr.substring(0, colonIndex);
+    const model = agentStr.substring(colonIndex + 1);
+
+    if (!model) {
+      throw new Error(
+        `No model specified for agent "${agentName}". Use format: --agent ${agentName}:model-name`,
+      );
+    }
+
+    // validateAgent is defined outside this hunk; presumably it throws for unsupported names, which the cast below relies on — TODO confirm
+    validateAgent(agentName);
+
+    agents.push({
+      name: agentName as SupportedAgent,
+      model,
+    });
+  }
+
+  return agents;
+}
+
 // Parse environment variables from KEY=value format
 function parseEnvVars(envVars: string[]): Record<string, string> {
   const result: Record<string, string> = {};
@@ -201,9 +244,8 @@ async function resolveBenchmarkId(benchmarkIdOrName: string): Promise<string> {
 
 export async function runBenchmarkJob(options: RunOptions) {
   try {
-    // Validate agent
-    validateAgent(options.agent);
-    const agent = options.agent as SupportedAgent;
+    // Parse agent strings (format: agent:model)
+    const parsedAgents = parseAgentStrings(options.agent);
 
     // Parse provided env vars and secrets
     const providedEnvVars = options.envVars
@@ -213,30 +255,37 @@ export async function runBenchmarkJob(options: RunOptions) {
       ? parseSecrets(options.secrets)
       : {};
 
-    // Ensure agent secrets exist (auto-create from env vars if needed)
-    // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY)
-    const agentSecrets = await ensureAgentSecrets(agent);
+    // Get unique agent names for secret setup
+    const uniqueAgentNames = [...new Set(parsedAgents.map((a) => a.name))];
 
-    // Validate that at least one secret is available (only if requiresAny is true)
-    const agentConfig = SUPPORTED_AGENTS[agent];
-    if (agentConfig.requiresAny) {
-      const hasAny = agentConfig.automaticEnvVars.some(
-        (varName) => agentSecrets[varName],
-      );
-      if (!hasAny) {
-        throw new Error(
-          `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` +
-            `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
-            `or set environment variables.`,
+    // Ensure secrets exist for all unique agents
+    // Collect all secrets across agents
+    const allAgentSecrets: Record<string, string> = {};
+    for (const agentName of uniqueAgentNames) {
+      const agentSecrets = await ensureAgentSecrets(agentName);
+
+      // Validate that at least one secret is available (only if requiresAny is true)
+      const agentConfig = SUPPORTED_AGENTS[agentName];
+      if (agentConfig.requiresAny) {
+        const hasAny = agentConfig.automaticEnvVars.some(
+          (varName) => agentSecrets[varName],
         );
+        if (!hasAny) {
+          throw new Error(
+            `Agent ${agentName} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` +
+              `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
+              `or set environment variables.`,
+          );
+        }
       }
+
+      // Merge secrets (later agents can use same secrets)
+      Object.assign(allAgentSecrets, agentSecrets);
     }
-    // If requiresAny is false, we just use whatever secrets were auto-populated
-    // User may be configuring credentials via other means (e.g., --secrets flag)
 
     // Combine agent secrets with user-provided secrets
     const secrets = {
-      ...agentSecrets,
+      ...allAgentSecrets,
       ...providedSecrets,
     };
 
@@ -266,25 +315,22 @@ export async function runBenchmarkJob(options: RunOptions) {
       quiet: false,
     };
 
+    // Build agent configs for all parsed agents
+    const agentConfigs = parsedAgents.map((agent) => ({
+      name: agent.name,
+      modelName: agent.model,
+      timeoutSeconds: options.timeout ? parseInt(options.timeout, 10) : 1800,
+      environmentVariables:
+        Object.keys(providedEnvVars).length > 0 ? providedEnvVars : undefined,
+      secrets,
+    }));
+
     // Create the benchmark job
     const job = await createBenchmarkJob({
       name: options.jobName,
       benchmarkId,
       scenarioIds: options.scenarios,
-      agentConfigs: [
-        {
-          name: agent,
-          modelName: options.model,
-          timeoutSeconds: options.timeout
-            ? parseInt(options.timeout, 10)
-            : 1800,
-          environmentVariables:
-            Object.keys(providedEnvVars).length > 0
-              ? providedEnvVars
-              : undefined,
-          secrets,
-        },
-      ],
+      agentConfigs,
       orchestratorConfig,
     });
 

diff --git a/src/utils/commands.ts b/src/utils/commands.ts
@@ -1020,12 +1020,11 @@ export function createProgram(): Command {
 
   benchmarkJob
     .command("run")
-    .description("Run a benchmark job with an agent")
-    .requiredOption(
-      "--agent <agent>",
-      "Agent to use (claude-code, codex, opencode, goose, gemini-cli)",
+    .description("Run a benchmark job with one or more agents")
+    .option(
+      "--agent <agents...>",
+      "Agent(s) to use. Format: agent:model (e.g., claude-code:claude-sonnet-4). Can specify multiple.",
     )
-    .requiredOption("--model <model>", "Model name for the agent")
     .option("--benchmark <id-or-name>", "Benchmark ID or name to run")
     .option(
       "--scenarios <ids...>",