From 3868d961c41cdf01ac4de2c0fbed46b5d1b42d29 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 11:18:52 -0800
Subject: [PATCH 1/6] add benchmark job cli

---
 src/commands/benchmark-job/run.ts    | 264 ++++++++++++++++++++++++
 src/commands/benchmark-job/status.ts | 295 +++++++++++++++++++++++++++
 src/utils/commands.ts                |  58 ++++++
 3 files changed, 617 insertions(+)
 create mode 100644 src/commands/benchmark-job/run.ts
 create mode 100644 src/commands/benchmark-job/status.ts

diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
new file mode 100644
index 00000000..99f7d466
--- /dev/null
+++ b/src/commands/benchmark-job/run.ts
@@ -0,0 +1,264 @@
+/**
+ * Run benchmark job command
+ */
+
+import { createBenchmarkJob } from "../../services/benchmarkJobService.js";
+import { listBenchmarks } from "../../services/benchmarkService.js";
+import { output, outputError } from "../../utils/output.js";
+
+// Supported agents and their required environment variables
+const SUPPORTED_AGENTS = {
+  "claude-code": {
+    requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
+    requiresAny: true, // At least one of these is required
+  },
+  codex: {
+    requiredEnvVars: ["OPENAI_API_KEY"],
+    requiresAny: false,
+  },
+  opencode: {
+    requiredEnvVars: ["ANTHROPIC_API_KEY"],
+    requiresAny: false,
+  },
+  goose: {
+    requiredEnvVars: ["ANTHROPIC_API_KEY"],
+    requiresAny: false,
+  },
+  "gemini-cli": {
+    requiredEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: true, // At least one of these is required
+  },
+} as const;
+
+type SupportedAgent = keyof typeof SUPPORTED_AGENTS;
+
+interface RunOptions {
+  agent: string;
+  model: string;
+  benchmark?: string;
+  scenarios?: string[];
+  jobName?: string;
+  envVars?: string[];
+  secrets?: string[];
+  timeout?: string;
+  nAttempts?: string;
+  nConcurrentTrials?: string;
+  timeoutMultiplier?: string;
+  output?: string;
+}
+
+// Parse KEY=value pairs, splitting on the first "="; an empty KEY is rejected
+function parseEnvVars(envVars: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const envVar of envVars) {
+    const eqIndex = envVar.indexOf("=");
+    if (eqIndex <= 0) {
+      throw new Error(
+        `Invalid environment variable format: ${envVar}. Expected KEY=value`,
+      );
+    }
+    const key = envVar.substring(0, eqIndex);
+    // Value is the text after the first "=" (may be empty or contain "=")
+    result[key] = envVar.substring(eqIndex + 1);
+  }
+  return result;
+}
+
+// Parse ENV_VAR=SECRET_NAME pairs; both sides of the first "=" must be non-empty
+function parseSecrets(secrets: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const secret of secrets) {
+    const eqIndex = secret.indexOf("=");
+    if (eqIndex <= 0 || eqIndex === secret.length - 1) {
+      throw new Error(
+        `Invalid secret format: ${secret}. Expected ENV_VAR=SECRET_NAME`,
+      );
+    }
+    const envVarName = secret.substring(0, eqIndex);
+    // Maps the env var name (left of "=") to the secret name (right of "=")
+    result[envVarName] = secret.substring(eqIndex + 1);
+  }
+  return result;
+}
+
+// Validate agent; own keys only, since "in" also matches e.g. "constructor"
+function validateAgent(agent: string): asserts agent is SupportedAgent {
+  if (!Object.prototype.hasOwnProperty.call(SUPPORTED_AGENTS, agent)) {
+    const supportedList = Object.keys(SUPPORTED_AGENTS).join(", ");
+    throw new Error(
+      `Unsupported agent: ${agent}. Supported agents: ${supportedList}`,
+    );
+  }
+}
+
+// Get env vars from current environment for the agent
+function getAgentEnvVars(agent: SupportedAgent): Record<string, string> {
+  const agentConfig = SUPPORTED_AGENTS[agent];
+  const envVars: Record<string, string> = {};
+
+  for (const varName of agentConfig.requiredEnvVars) {
+    const value = process.env[varName];
+    if (value) {
+      envVars[varName] = value;
+    }
+  }
+
+  return envVars;
+}
+
+// Validate that required env vars are present
+function validateEnvVars(
+  agent: SupportedAgent,
+  providedEnvVars: Record<string, string>,
+): void {
+  const agentConfig = SUPPORTED_AGENTS[agent];
+  const allEnvVars = { ...getAgentEnvVars(agent), ...providedEnvVars };
+
+  if (agentConfig.requiresAny) {
+    // At least one of the required env vars must be present
+    const hasAny = agentConfig.requiredEnvVars.some(
+      (varName) => allEnvVars[varName],
+    );
+    if (!hasAny) {
+      throw new Error(
+        `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
+          `Set via --env-vars or as environment variables.`,
+      );
+    }
+  } else {
+    // For agents that don't use requiresAny, we just need at least one key
+    // since different models may need different keys
+    const hasAny = agentConfig.requiredEnvVars.some(
+      (varName) => allEnvVars[varName],
+    );
+    if (!hasAny) {
+      throw new Error(
+        `Agent ${agent} requires environment variables. Expected one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
+          `Set via --env-vars or as environment variables.`,
+      );
+    }
+  }
+}
+
+// Resolve benchmark name to ID if needed
+async function resolveBenchmarkId(benchmarkIdOrName: string): Promise<string> {
+  // If it looks like an ID (starts with bm_ or similar), return as-is
+  if (
+    benchmarkIdOrName.startsWith("bm_") ||
+    benchmarkIdOrName.startsWith("bmk_")
+  ) {
+    return benchmarkIdOrName;
+  }
+
+  // Otherwise, search for benchmark by name
+  const result = await listBenchmarks({
+    limit: 100,
+    search: benchmarkIdOrName,
+  });
+
+  // Look for exact name match
+  const exactMatch = result.benchmarks.find(
+    (b) => b.name === benchmarkIdOrName,
+  );
+
+  if (exactMatch) {
+    return exactMatch.id;
+  }
+
+  if (result.benchmarks.length === 0) {
+    throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`);
+  }
+
+  // If no exact match but we have results, suggest them
+  const suggestions = result.benchmarks
+    .slice(0, 5)
+    .map((b) => `  - ${b.name} (${b.id})`)
+    .join("\n");
+  throw new Error(
+    `No exact match for benchmark "${benchmarkIdOrName}". Did you mean:\n${suggestions}`,
+  );
+}
+
+export async function runBenchmarkJob(options: RunOptions) {
+  try {
+    // Validate agent
+    validateAgent(options.agent);
+    const agent = options.agent as SupportedAgent;
+
+    // Parse provided env vars and secrets
+    const providedEnvVars = options.envVars
+      ? parseEnvVars(options.envVars)
+      : {};
+    const providedSecrets = options.secrets
+      ? parseSecrets(options.secrets)
+      : {};
+
+    // Merge environment variables (CLI-provided override auto-detected)
+    const environmentVariables = {
+      ...getAgentEnvVars(agent),
+      ...providedEnvVars,
+    };
+
+    // Validate required env vars
+    validateEnvVars(agent, providedEnvVars);
+
+    // Validate that either benchmark or scenarios is provided, but not both
+    if (!options.benchmark && !options.scenarios) {
+      throw new Error(
+        "Either --benchmark or --scenarios must be specified",
+      );
+    }
+    if (options.benchmark && options.scenarios) {
+      throw new Error("Cannot specify both --benchmark and --scenarios");
+    }
+
+    // Resolve benchmark ID if name was provided
+    let benchmarkId: string | undefined;
+    if (options.benchmark) {
+      benchmarkId = await resolveBenchmarkId(options.benchmark);
+    }
+
+    // Build orchestrator config with defaults
+    const orchestratorConfig = {
+      nConcurrentTrials: options.nConcurrentTrials
+        ? parseInt(options.nConcurrentTrials, 10)
+        : 10,
+      nAttempts: options.nAttempts ? parseInt(options.nAttempts, 10) : 1,
+      timeoutMultiplier: options.timeoutMultiplier
+        ? parseFloat(options.timeoutMultiplier)
+        : 1.0,
+      quiet: false,
+    };
+
+    // Create the benchmark job
+    const job = await createBenchmarkJob({
+      name: options.jobName,
+      benchmarkId,
+      scenarioIds: options.scenarios,
+      agentConfigs: [
+        {
+          name: agent,
+          modelName: options.model,
+          timeoutSeconds: options.timeout
+            ? parseInt(options.timeout, 10)
+            : 1800,
+          environmentVariables,
+          secrets:
+            Object.keys(providedSecrets).length > 0
+              ? providedSecrets
+              : undefined,
+        },
+      ],
+      orchestratorConfig,
+    });
+
+    // Output result
+    if (!options.output || options.output === "text") {
+      console.log(job.id);
+    } else {
+      output(job, { format: options.output, defaultFormat: "json" });
+    }
+  } catch (error) {
+    outputError("Failed to run benchmark job", error);
+  }
+}
diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts
new file mode 100644
index 00000000..fa55ffc4
--- /dev/null
+++ b/src/commands/benchmark-job/status.ts
@@ -0,0 +1,295 @@
+/**
+ * Status benchmark job command
+ */
+
+import chalk from "chalk";
+import { getBenchmarkJob } from "../../services/benchmarkJobService.js";
+import { output, outputError } from "../../utils/output.js";
+
+interface StatusOptions {
+  wait?: boolean;
+  output?: string;
+}
+
+// Job states that indicate completion
+const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
+
+// Polling config
+const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds
+const MAX_WAIT_MS = 60 * 60 * 1000; // 1 hour
+
+// Sleep utility
+function sleep(ms: number): Promise<void> {
+  return new Promise((resolve) => setTimeout(resolve, ms));
+}
+
+interface ScenarioOutcome {
+  scenario_name?: string;
+  scenario_definition_id?: string;
+  state?: string;
+  score?: number;
+}
+
+interface BenchmarkOutcome {
+  agent_name?: string;
+  model_name?: string;
+  scenario_outcomes?: ScenarioOutcome[];
+}
+
+interface JobData {
+  id: string;
+  name?: string;
+  state?: string;
+  benchmark_outcomes?: BenchmarkOutcome[];
+}
+
+// Bucket scenario outcomes into passed / failed-by-score / failed-by-error
+function calculateStats(outcomes: ScenarioOutcome[]): {
+  total: number;
+  passed: number;
+  failedZero: number;
+  failedError: number;
+} {
+  const tally = {
+    passed: 0,
+    failedZero: 0,
+    failedError: 0,
+  };
+
+  for (const { state, score } of outcomes) {
+    // Any state other than COMPLETED (case-insensitive) counts as an error
+    if (state?.toUpperCase() !== "COMPLETED") {
+      tally.failedError++;
+      continue;
+    }
+    // Only an exact score of 1.0 passes; anything else is a scored failure
+    if (score === 1.0) {
+      tally.passed++;
+    } else {
+      tally.failedZero++;
+    }
+  }
+
+  return {
+    total: outcomes.length,
+    passed: tally.passed,
+    failedZero: tally.failedZero,
+    failedError: tally.failedError,
+  };
+}
+
+// Render count/total as a percentage with one decimal; "0.0%" when total is 0
+function formatPercent(count: number, total: number): string {
+  const ratio = total === 0 ? 0 : count / total;
+  return `${(ratio * 100).toFixed(1)}%`;
+}
+
+// Print a brief summary: name, ID, state and, once finished, the pass rate
+function printStatus(job: JobData): void {
+  const state = job.state || "unknown";
+
+  console.log(`Job: ${job.name || job.id}`);
+  console.log(`ID: ${job.id}`);
+  console.log(`State: ${state}`);
+
+  // Aggregate results are only reported for states listed in COMPLETED_STATES
+  if (!COMPLETED_STATES.includes(state)) {
+    return;
+  }
+
+  // Sum scenario totals and passes across every benchmark outcome
+  let scenarioCount = 0;
+  let passCount = 0;
+  for (const outcome of job.benchmark_outcomes || []) {
+    const { total, passed } = calculateStats(outcome.scenario_outcomes || []);
+    scenarioCount += total;
+    passCount += passed;
+  }
+  if (scenarioCount > 0) {
+    console.log(
+      `Results: ${passCount}/${scenarioCount} passed (${formatPercent(passCount, scenarioCount)})`,
+    );
+  }
+}
+
+// Print results table
+function printResultsTable(job: JobData): void {
+  const outcomes = job.benchmark_outcomes || [];
+
+  if (outcomes.length === 0) {
+    console.log(chalk.yellow("No benchmark outcomes found"));
+    return;
+  }
+
+  // Header
+  console.log();
+  console.log(chalk.bold("Benchmark Job Results"));
+  console.log(chalk.dim(`Job ID: ${job.id}`));
+  if (job.name) {
+    console.log(chalk.dim(`Name: ${job.name}`));
+  }
+  console.log(chalk.dim(`State: ${job.state}`));
+  console.log();
+
+  // Table header
+  const agentCol = "Agent / Model".padEnd(40);
+  const passedCol = "Passed".padStart(10);
+  const failedCol = "Failed (0.0)".padStart(14);
+  const errorCol = "Failed (error)".padStart(16);
+  const totalCol = "Total".padStart(8);
+
+  console.log(
+    chalk.bold(agentCol + passedCol + failedCol + errorCol + totalCol),
+  );
+  console.log(chalk.dim("-".repeat(88)));
+
+  // Print each agent's results
+  for (const outcome of outcomes) {
+    const agentName = outcome.agent_name || "unknown";
+    const modelName = outcome.model_name || "default";
+    const scenarioOutcomes = outcome.scenario_outcomes || [];
+
+    const stats = calculateStats(scenarioOutcomes);
+
+    // Format agent/model column
+    let agentModelStr = agentName;
+    if (modelName && modelName !== "default") {
+      agentModelStr += ` (${modelName})`;
+    }
+    if (agentModelStr.length > 38) {
+      agentModelStr = agentModelStr.slice(0, 35) + "...";
+    }
+    const agentModelCol = agentModelStr.padEnd(40);
+
+    // Format stats columns with colors
+    const passedStr = formatPercent(stats.passed, stats.total);
+    const failedZeroStr = formatPercent(stats.failedZero, stats.total);
+    const failedErrorStr = formatPercent(stats.failedError, stats.total);
+
+    const passedColored =
+      stats.passed > 0
+        ? chalk.green(passedStr.padStart(10))
+        : chalk.dim(passedStr.padStart(10));
+
+    const failedZeroColored =
+      stats.failedZero > 0
+        ? chalk.yellow(failedZeroStr.padStart(14))
+        : chalk.dim(failedZeroStr.padStart(14));
+
+    const failedErrorColored =
+      stats.failedError > 0
+        ? chalk.red(failedErrorStr.padStart(16))
+        : chalk.dim(failedErrorStr.padStart(16));
+
+    const totalColStr = String(stats.total).padStart(8);
+
+    console.log(
+      agentModelCol +
+        passedColored +
+        failedZeroColored +
+        failedErrorColored +
+        chalk.dim(totalColStr),
+    );
+
+    // Print individual scenario results underneath (indented)
+    for (const scenario of scenarioOutcomes) {
+      const scenarioName =
+        scenario.scenario_name || scenario.scenario_definition_id || "unknown";
+      const state = scenario.state || "unknown";
+      const score = scenario.score;
+
+      let statusIcon: string;
+      let statusColor: typeof chalk.green;
+
+      if (state.toUpperCase() === "COMPLETED") {
+        if (score === 1.0) {
+          statusIcon = chalk.green("\u2713"); // checkmark
+          statusColor = chalk.green;
+        } else {
+          statusIcon = chalk.yellow("\u2717"); // X
+          statusColor = chalk.yellow;
+        }
+      } else {
+        statusIcon = chalk.red("!");
+        statusColor = chalk.red;
+      }
+
+      const scenarioNameTrunc =
+        scenarioName.length > 50
+          ? scenarioName.slice(0, 47) + "..."
+          : scenarioName;
+
+      const scoreStr =
+        score !== undefined ? `score=${score.toFixed(1)}` : state;
+
+      console.log(
+        chalk.dim("  ") +
+          statusIcon +
+          " " +
+          chalk.dim(scenarioNameTrunc.padEnd(52)) +
+          statusColor(scoreStr),
+      );
+    }
+  }
+
+  console.log();
+}
+
+export async function statusBenchmarkJob(
+  id: string,
+  options: StatusOptions = {},
+) {
+  try {
+    // Initial fetch
+    let job = (await getBenchmarkJob(id)) as unknown as JobData;
+
+    // Check if job is complete
+    const isComplete = COMPLETED_STATES.includes(job.state || "");
+
+    // If not waiting or already complete, just print status/results
+    if (!options.wait || isComplete) {
+      if (options.output && options.output !== "text") {
+        output(job, { format: options.output, defaultFormat: "json" });
+      } else if (isComplete) {
+        printResultsTable(job);
+      } else {
+        printStatus(job);
+      }
+      return;
+    }
+
+    // Wait mode: poll until complete
+    const jobName = job.name || job.id;
+    console.log(chalk.cyan(`Awaiting job "${jobName}" completion...`));
+    console.log(chalk.dim(`Current state: ${job.state}`));
+    console.log();
+
+    const startTime = Date.now();
+
+    while (!COMPLETED_STATES.includes(job.state || "")) {
+      // Timed out: report and stop polling even if outputError returns
+      if (Date.now() - startTime > MAX_WAIT_MS) {
+        console.log();
+        return outputError(
+          `Timeout waiting for job completion after ${MAX_WAIT_MS / 1000 / 60} minutes`,
+        );
+      }
+
+      await sleep(POLL_INTERVAL_MS);
+      job = (await getBenchmarkJob(id)) as unknown as JobData;
+      process.stdout.write(chalk.dim("."));
+    }
+
+    console.log();
+    console.log();
+
+    // Output based on format
+    if (options.output && options.output !== "text") {
+      output(job, { format: options.output, defaultFormat: "json" });
+    } else {
+      printResultsTable(job);
+    }
+  } catch (error) {
+    outputError("Failed to get benchmark job status", error);
+  }
+}
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 07c67d0b..e6bbfc79 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1012,6 +1012,64 @@ export function createProgram(): Command {
       await installMcpConfig();
     });
 
+  // Benchmark job commands
+  const benchmarkJob = program
+    .command("benchmark-job")
+    .description("Manage benchmark jobs")
+    .alias("bmj");
+
+  benchmarkJob
+    .command("run")
+    .description("Run a benchmark job with an agent")
+    .requiredOption(
+      "--agent <agent>",
+      "Agent to use (claude-code, codex, opencode, goose, gemini-cli)",
+    )
+    .requiredOption("--model <model>", "Model name for the agent")
+    .option("--benchmark <id-or-name>", "Benchmark ID or name to run")
+    .option(
+      "--scenarios <ids...>",
+      "Scenario IDs to run (alternative to --benchmark)",
+    )
+    .option("-n, --job-name <name>", "Job name")
+    .option(
+      "--env-vars <vars...>",
+      "Environment variables (format: KEY=value). Agent-specific API keys are auto-detected from environment.",
+    )
+    .option(
+      "--secrets <secrets...>",
+      "Secrets to inject as environment variables (format: ENV_VAR=SECRET_NAME)",
+    )
+    .option("--timeout <seconds>", "Agent timeout in seconds")
+    .option("--n-attempts <n>", "Number of attempts per scenario")
+    .option("--n-concurrent-trials <n>", "Number of concurrent trials")
+    .option("--timeout-multiplier <n>", "Timeout multiplier")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (options) => {
+      const { runBenchmarkJob } = await import(
+        "../commands/benchmark-job/run.js"
+      );
+      await runBenchmarkJob(options);
+    });
+
+  benchmarkJob
+    .command("status <id>")
+    .description("Get benchmark job status and results")
+    .option("-w, --wait", "Wait for job to complete before showing results")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (id, options) => {
+      const { statusBenchmarkJob } = await import(
+        "../commands/benchmark-job/status.js"
+      );
+      await statusBenchmarkJob(id, options);
+    });
+
   // Hidden command: 'rli mcp' without subcommand starts the server (for Claude Desktop config compatibility)
   program
     .command("mcp-server", { hidden: true })

From e0d2419ffe12ac6d45ee42ea78f4f5d648a18e97 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 12:33:51 -0800
Subject: [PATCH 2/6] cp

---
 src/commands/benchmark-job/run.ts | 167 +++++++++++++++++++-----------
 src/services/benchmarkService.ts  |  34 ++++++
 src/utils/commands.ts             |   2 +-
 3 files changed, 139 insertions(+), 64 deletions(-)

diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
index 99f7d466..3982e9e1 100644
--- a/src/commands/benchmark-job/run.ts
+++ b/src/commands/benchmark-job/run.ts
@@ -2,11 +2,19 @@
  * Run benchmark job command
  */
 
+import chalk from "chalk";
 import { createBenchmarkJob } from "../../services/benchmarkJobService.js";
-import { listBenchmarks } from "../../services/benchmarkService.js";
+import {
+  listBenchmarks,
+  listPublicBenchmarks,
+} from "../../services/benchmarkService.js";
+import { getClient } from "../../utils/client.js";
 import { output, outputError } from "../../utils/output.js";
 
-// Supported agents and their required environment variables
+// Secret name prefix for benchmark job secrets
+const SECRET_PREFIX = "BMJ_";
+
+// Supported agents and their required environment variables (mapped to BMJ_* secrets)
 const SUPPORTED_AGENTS = {
   "claude-code": {
     requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
@@ -91,53 +99,55 @@ function validateAgent(agent: string): asserts agent is SupportedAgent {
   }
 }
 
-// Get env vars from current environment for the agent
-function getAgentEnvVars(agent: SupportedAgent): Record<string, string> {
-  const agentConfig = SUPPORTED_AGENTS[agent];
-  const envVars: Record<string, string> = {};
-
-  for (const varName of agentConfig.requiredEnvVars) {
-    const value = process.env[varName];
-    if (value) {
-      envVars[varName] = value;
-    }
-  }
+// Check if a secret exists by name
+async function secretExists(secretName: string): Promise<boolean> {
+  const client = getClient();
+  const result = await client.secrets.list({ limit: 5000 });
+  return result.secrets?.some((s) => s.name === secretName) ?? false;
+}
 
-  return envVars;
+// Create a secret
+async function createSecret(name: string, value: string): Promise<void> {
+  const client = getClient();
+  await client.secrets.create({ name, value });
 }
 
-// Validate that required env vars are present
-function validateEnvVars(
+// Ensure agent secrets exist, creating them from env vars if needed
+// Returns the secrets mapping (ENV_VAR -> BMJ_ENV_VAR)
+async function ensureAgentSecrets(
   agent: SupportedAgent,
-  providedEnvVars: Record<string, string>,
-): void {
+): Promise<Record<string, string>> {
   const agentConfig = SUPPORTED_AGENTS[agent];
-  const allEnvVars = { ...getAgentEnvVars(agent), ...providedEnvVars };
+  const secrets: Record<string, string> = {};
 
-  if (agentConfig.requiresAny) {
-    // At least one of the required env vars must be present
-    const hasAny = agentConfig.requiredEnvVars.some(
-      (varName) => allEnvVars[varName],
-    );
-    if (!hasAny) {
-      throw new Error(
-        `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
-          `Set via --env-vars or as environment variables.`,
+  for (const varName of agentConfig.requiredEnvVars) {
+    const secretName = `${SECRET_PREFIX}${varName}`;
+    const envValue = process.env[varName];
+
+    // Check if secret exists
+    const exists = await secretExists(secretName);
+
+    if (exists) {
+      console.log(chalk.dim(`Secret ${secretName} exists`));
+      secrets[varName] = secretName;
+    } else if (envValue) {
+      // Create secret from env var
+      console.log(
+        chalk.cyan(`Creating secret ${secretName} from ${varName} env var`),
       );
-    }
-  } else {
-    // For agents that don't use requiresAny, we just need at least one key
-    // since different models may need different keys
-    const hasAny = agentConfig.requiredEnvVars.some(
-      (varName) => allEnvVars[varName],
-    );
-    if (!hasAny) {
-      throw new Error(
-        `Agent ${agent} requires environment variables. Expected one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
-          `Set via --env-vars or as environment variables.`,
+      await createSecret(secretName, envValue);
+      secrets[varName] = secretName;
+    } else {
+      // No secret and no env var - skip (will be validated later if required)
+      console.log(
+        chalk.yellow(
+          `Secret ${secretName} not found and ${varName} not set in environment`,
+        ),
       );
     }
   }
+
+  return secrets;
 }
 
 // Resolve benchmark name to ID if needed
@@ -150,27 +160,34 @@ async function resolveBenchmarkId(benchmarkIdOrName: string): Promise<string> {
     return benchmarkIdOrName;
   }
 
-  // Otherwise, search for benchmark by name
-  const result = await listBenchmarks({
-    limit: 100,
-    search: benchmarkIdOrName,
-  });
+  // Search both user benchmarks and public benchmarks
+  const [userResult, publicResult] = await Promise.all([
+    listBenchmarks({
+      limit: 100,
+      search: benchmarkIdOrName,
+    }),
+    listPublicBenchmarks({
+      limit: 100,
+      search: benchmarkIdOrName,
+    }),
+  ]);
+
+  // Combine results
+  const allBenchmarks = [...userResult.benchmarks, ...publicResult.benchmarks];
 
   // Look for exact name match
-  const exactMatch = result.benchmarks.find(
-    (b) => b.name === benchmarkIdOrName,
-  );
+  const exactMatch = allBenchmarks.find((b) => b.name === benchmarkIdOrName);
 
   if (exactMatch) {
     return exactMatch.id;
   }
 
-  if (result.benchmarks.length === 0) {
+  if (allBenchmarks.length === 0) {
     throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`);
   }
 
   // If no exact match but we have results, suggest them
-  const suggestions = result.benchmarks
+  const suggestions = allBenchmarks
     .slice(0, 5)
     .map((b) => `  - ${b.name} (${b.id})`)
     .join("\n");
@@ -193,20 +210,44 @@ export async function runBenchmarkJob(options: RunOptions) {
       ? parseSecrets(options.secrets)
       : {};
 
-    // Merge environment variables (CLI-provided override auto-detected)
-    const environmentVariables = {
-      ...getAgentEnvVars(agent),
-      ...providedEnvVars,
-    };
+    // Ensure agent secrets exist (auto-create from env vars if needed)
+    // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY)
+    const agentSecrets = await ensureAgentSecrets(agent);
+
+    // Validate that at least one required secret is available
+    const agentConfig = SUPPORTED_AGENTS[agent];
+    if (agentConfig.requiresAny) {
+      const hasAny = agentConfig.requiredEnvVars.some(
+        (varName) => agentSecrets[varName],
+      );
+      if (!hasAny) {
+        throw new Error(
+          `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
+            `Create secrets (${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
+            `or set environment variables.`,
+        );
+      }
+    } else {
+      const hasAny = agentConfig.requiredEnvVars.some(
+        (varName) => agentSecrets[varName],
+      );
+      if (!hasAny) {
+        throw new Error(
+          `Agent ${agent} requires secrets. Expected one of: ${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}. ` +
+            `Create secrets or set environment variables.`,
+        );
+      }
+    }
 
-    // Validate required env vars
-    validateEnvVars(agent, providedEnvVars);
+    // Combine agent secrets with user-provided secrets
+    const secrets = {
+      ...agentSecrets,
+      ...providedSecrets,
+    };
 
     // Validate that either benchmark or scenarios is provided, but not both
     if (!options.benchmark && !options.scenarios) {
-      throw new Error(
-        "Either --benchmark or --scenarios must be specified",
-      );
+      throw new Error("Either --benchmark or --scenarios must be specified");
     }
     if (options.benchmark && options.scenarios) {
       throw new Error("Cannot specify both --benchmark and --scenarios");
@@ -242,11 +283,11 @@ export async function runBenchmarkJob(options: RunOptions) {
           timeoutSeconds: options.timeout
             ? parseInt(options.timeout, 10)
             : 1800,
-          environmentVariables,
-          secrets:
-            Object.keys(providedSecrets).length > 0
-              ? providedSecrets
+          environmentVariables:
+            Object.keys(providedEnvVars).length > 0
+              ? providedEnvVars
               : undefined,
+          secrets,
         },
       ],
       orchestratorConfig,
diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts
index 17a164cd..e6373464 100644
--- a/src/services/benchmarkService.ts
+++ b/src/services/benchmarkService.ts
@@ -179,6 +179,40 @@ export async function getBenchmark(id: string): Promise<Benchmark> {
   return client.benchmarks.retrieve(id);
 }
 
+/**
+ * Fetch one page of public benchmarks, forwarding limit/cursor/search filters
+ */
+export async function listPublicBenchmarks(
+  options: ListBenchmarksOptions,
+): Promise<ListBenchmarksResult> {
+  const client = getClient();
+  const { limit, startingAfter, search } = options;
+
+  // Cursor and search are only sent when the caller supplied a truthy value
+  const queryParams: {
+    limit?: number;
+    starting_after?: string;
+    search?: string;
+  } = {
+    limit,
+    ...(startingAfter ? { starting_after: startingAfter } : {}),
+    ...(search ? { search } : {}),
+  };
+
+  const page = await client.benchmarks.listPublic(queryParams);
+  const benchmarks = page.benchmarks || [];
+
+  // totalCount reflects only this page's length, not a server-side total
+  const totalCount = benchmarks.length;
+  const hasMore = page.has_more || false;
+
+  return {
+    benchmarks,
+    totalCount,
+    hasMore,
+  };
+}
+
 /**
  * Create/start a benchmark run with selected benchmarks
  */
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index e6bbfc79..31439fe4 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1034,7 +1034,7 @@ export function createProgram(): Command {
     .option("-n, --job-name <name>", "Job name")
     .option(
       "--env-vars <vars...>",
-      "Environment variables (format: KEY=value). Agent-specific API keys are auto-detected from environment.",
+      "Additional environment variables (format: KEY=value)",
     )
     .option(
       "--secrets <secrets...>",

From 7bcb047b26799956ac9c5ff36bcf71640885d0a1 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 12:34:36 -0800
Subject: [PATCH 3/6] cp

---
 src/utils/commands.ts | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 31439fe4..8f922289 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1049,9 +1049,8 @@ export function createProgram(): Command {
       "Output format: text|json|yaml (default: text)",
     )
     .action(async (options) => {
-      const { runBenchmarkJob } = await import(
-        "../commands/benchmark-job/run.js"
-      );
+      const { runBenchmarkJob } =
+        await import("../commands/benchmark-job/run.js");
       await runBenchmarkJob(options);
     });
 
@@ -1064,9 +1063,8 @@ export function createProgram(): Command {
       "Output format: text|json|yaml (default: text)",
     )
     .action(async (id, options) => {
-      const { statusBenchmarkJob } = await import(
-        "../commands/benchmark-job/status.js"
-      );
+      const { statusBenchmarkJob } =
+        await import("../commands/benchmark-job/status.js");
       await statusBenchmarkJob(id, options);
     });
 

From 40900430598079c511c2d142a6aefefc4757b080 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 12:34:56 -0800
Subject: [PATCH 4/6] cp

---
 README.md | 7 +++++++
 1 file changed, 7 insertions(+)

diff --git a/README.md b/README.md
index 9f833a11..86d1bb95 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,13 @@ rli mcp start                            # Start the MCP server
 rli mcp install                          # Install Runloop MCP server configurat...
 ```
 
+### Benchmark-job Commands (alias: `bmj`)
+
+```bash
+rli benchmark-job run                    # Run a benchmark job with an agent
+rli benchmark-job status <id>            # Get benchmark job status and results
+```
+
 
 ## MCP Server (AI Integration)
 

From 523b89c9567a9d6d96dbc5d8b630ca633ec710a1 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 12:41:45 -0800
Subject: [PATCH 5/6] rename requiredEnvVars to automaticEnvVars; only enforce secrets when requiresAny

---
 src/commands/benchmark-job/run.ts | 42 +++++++++++++------------------
 1 file changed, 18 insertions(+), 24 deletions(-)

diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
index 3982e9e1..79e426bf 100644
--- a/src/commands/benchmark-job/run.ts
+++ b/src/commands/benchmark-job/run.ts
@@ -14,26 +14,28 @@ import { output, outputError } from "../../utils/output.js";
 // Secret name prefix for benchmark job secrets
 const SECRET_PREFIX = "BMJ_";
 
-// Supported agents and their required environment variables (mapped to BMJ_* secrets)
+// Supported agents and their automatic environment variables (mapped to BMJ_* secrets)
+// - automaticEnvVars: env vars that will be auto-populated from secrets or environment
+// - requiresAny: if true, at least one must be set; if false, just try to auto-populate
 const SUPPORTED_AGENTS = {
   "claude-code": {
-    requiredEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
     requiresAny: true, // At least one of these is required
   },
   codex: {
-    requiredEnvVars: ["OPENAI_API_KEY"],
-    requiresAny: false,
+    automaticEnvVars: ["OPENAI_API_KEY"],
+    requiresAny: true,
   },
   opencode: {
-    requiredEnvVars: ["ANTHROPIC_API_KEY"],
-    requiresAny: false,
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
   },
   goose: {
-    requiredEnvVars: ["ANTHROPIC_API_KEY"],
-    requiresAny: false,
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
   },
   "gemini-cli": {
-    requiredEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
+    automaticEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
     requiresAny: true, // At least one of these is required
   },
 } as const;
@@ -120,7 +122,7 @@ async function ensureAgentSecrets(
   const agentConfig = SUPPORTED_AGENTS[agent];
   const secrets: Record<string, string> = {};
 
-  for (const varName of agentConfig.requiredEnvVars) {
+  for (const varName of agentConfig.automaticEnvVars) {
     const secretName = `${SECRET_PREFIX}${varName}`;
     const envValue = process.env[varName];
 
@@ -214,30 +216,22 @@ export async function runBenchmarkJob(options: RunOptions) {
     // Maps ENV_VAR -> BMJ_ENV_VAR (e.g., ANTHROPIC_API_KEY -> BMJ_ANTHROPIC_API_KEY)
     const agentSecrets = await ensureAgentSecrets(agent);
 
-    // Validate that at least one required secret is available
+    // Validate that at least one secret is available (only if requiresAny is true)
     const agentConfig = SUPPORTED_AGENTS[agent];
     if (agentConfig.requiresAny) {
-      const hasAny = agentConfig.requiredEnvVars.some(
+      const hasAny = agentConfig.automaticEnvVars.some(
         (varName) => agentSecrets[varName],
       );
       if (!hasAny) {
         throw new Error(
-          `Agent ${agent} requires at least one of: ${agentConfig.requiredEnvVars.join(", ")}. ` +
-            `Create secrets (${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
+          `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` +
+            `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
             `or set environment variables.`,
         );
       }
-    } else {
-      const hasAny = agentConfig.requiredEnvVars.some(
-        (varName) => agentSecrets[varName],
-      );
-      if (!hasAny) {
-        throw new Error(
-          `Agent ${agent} requires secrets. Expected one of: ${agentConfig.requiredEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}. ` +
-            `Create secrets or set environment variables.`,
-        );
-      }
     }
+    // If requiresAny is false, we just use whatever secrets were auto-populated
+    // User may be configuring credentials via other means (e.g., --secrets flag)
 
     // Combine agent secrets with user-provided secrets
     const secrets = {

From 3e364c202e5811dc4eb97cfbe3b07fa5a106ab63 Mon Sep 17 00:00:00 2001
From: Ross <ross@runloop.ai>
Date: Thu, 5 Mar 2026 14:02:55 -0800
Subject: [PATCH 6/6] pr feedback

---
 src/commands/benchmark-job/run.ts    | 1 +
 src/commands/benchmark-job/status.ts | 6 +++---
 src/utils/commands.ts                | 2 +-
 3 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
index 79e426bf..e0960841 100644
--- a/src/commands/benchmark-job/run.ts
+++ b/src/commands/benchmark-job/run.ts
@@ -104,6 +104,7 @@ function validateAgent(agent: string): asserts agent is SupportedAgent {
 // Check if a secret exists by name
 async function secretExists(secretName: string): Promise<boolean> {
   const client = getClient();
+  // TODO: Fetch the secret by name once the API exposes that; listing is a stopgap and may miss secrets beyond the limit.
   const result = await client.secrets.list({ limit: 5000 });
   return result.secrets?.some((s) => s.name === secretName) ?? false;
 }
diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts
index fa55ffc4..c65bef8a 100644
--- a/src/commands/benchmark-job/status.ts
+++ b/src/commands/benchmark-job/status.ts
@@ -7,7 +7,7 @@ import { getBenchmarkJob } from "../../services/benchmarkJobService.js";
 import { output, outputError } from "../../utils/output.js";
 
 interface StatusOptions {
-  wait?: boolean;
+  watch?: boolean;
   output?: string;
 }
 
@@ -16,7 +16,7 @@ const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
 
 // Polling config
 const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds
-const MAX_WAIT_MS = 60 * 60 * 1000; // 1 hour
+const MAX_WAIT_MS = 60 * 60 * 4 * 1000; // 4 hours
 
 // Sleep utility
 function sleep(ms: number): Promise<void> {
@@ -247,7 +247,7 @@ export async function statusBenchmarkJob(
     const isComplete = COMPLETED_STATES.includes(job.state || "");
 
     // If not waiting or already complete, just print status/results
-    if (!options.wait || isComplete) {
+    if (!options.watch || isComplete) {
       if (options.output && options.output !== "text") {
         output(job, { format: options.output, defaultFormat: "json" });
       } else if (isComplete) {
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 8f922289..80038122 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1057,7 +1057,7 @@ export function createProgram(): Command {
   benchmarkJob
     .command("status <id>")
     .description("Get benchmark job status and results")
-    .option("-w, --wait", "Wait for job to complete before showing results")
+    .option("-w, --watch", "Watch for job to complete before showing results")
     .option(
       "-o, --output [format]",
       "Output format: text|json|yaml (default: text)",