diff --git a/README.md b/README.md
index 9f833a11..86d1bb95 100644
--- a/README.md
+++ b/README.md
@@ -181,6 +181,13 @@ rli mcp start                            # Start the MCP server
 rli mcp install                          # Install Runloop MCP server configurat...
 ```
 
+### Benchmark-job Commands (alias: `bmj`)
+
+```bash
+rli benchmark-job run                    # Run a benchmark job with an agent
+rli benchmark-job status <id>            # Get benchmark job status and results
+```
+
 
 ## MCP Server (AI Integration)
 
diff --git a/src/commands/benchmark-job/run.ts b/src/commands/benchmark-job/run.ts
new file mode 100644
index 00000000..e0960841
--- /dev/null
+++ b/src/commands/benchmark-job/run.ts
@@ -0,0 +1,300 @@
+/**
+ * Run benchmark job command
+ */
+
+import chalk from "chalk";
+import { createBenchmarkJob } from "../../services/benchmarkJobService.js";
+import {
+  listBenchmarks,
+  listPublicBenchmarks,
+} from "../../services/benchmarkService.js";
+import { getClient } from "../../utils/client.js";
+import { output, outputError } from "../../utils/output.js";
+
+// Prefix added to an env var name to form its secret name (e.g. BMJ_OPENAI_API_KEY)
+const SECRET_PREFIX = "BMJ_";
+
+// Supported agents and the env vars auto-populated for each (mapped to BMJ_* secrets)
+// - automaticEnvVars: for each NAME, secret BMJ_NAME is reused, or created from process.env.NAME
+// - requiresAny: if true, at least one must be set; if false, just try to auto-populate
+const SUPPORTED_AGENTS = {
+  "claude-code": {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "CLAUDE_CODE_OAUTH_TOKEN"],
+    requiresAny: true, // At least one of these is required
+  },
+  codex: {
+    automaticEnvVars: ["OPENAI_API_KEY"],
+    requiresAny: true, // The single variable above is required
+  },
+  opencode: {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
+  },
+  goose: {
+    automaticEnvVars: ["ANTHROPIC_API_KEY", "OPENAI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: false, // Try to auto-populate, but user may configure differently
+  },
+  "gemini-cli": {
+    automaticEnvVars: ["GEMINI_API_KEY", "GOOGLE_API_KEY"],
+    requiresAny: true, // At least one of these is required
+  },
+} as const;
+
+type SupportedAgent = keyof typeof SUPPORTED_AGENTS; // Agent names, derived from the keys above
+
+interface RunOptions {
+  agent: string; // Must be a key of SUPPORTED_AGENTS
+  model: string; // Sent as the agent config's modelName
+  benchmark?: string; // Benchmark ID or name; mutually exclusive with scenarios
+  scenarios?: string[]; // Scenario IDs; mutually exclusive with benchmark
+  jobName?: string; // Sent as the job's name
+  envVars?: string[]; // KEY=value strings
+  secrets?: string[]; // ENV_VAR=SECRET_NAME strings
+  timeout?: string; // Agent timeout in seconds (default 1800)
+  nAttempts?: string; // Attempts per scenario (default 1)
+  nConcurrentTrials?: string; // Concurrent trials (default 10)
+  timeoutMultiplier?: string; // Parsed as a float (default 1.0)
+  output?: string; // Unset or "text" prints the job ID only
+}
+
+// Parse KEY=value strings into a map; throws if "=" is missing or KEY is empty.
+function parseEnvVars(envVars: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const envVar of envVars) {
+    // Split on the first "=" only, so values may themselves contain "=".
+    const eqIndex = envVar.indexOf("=");
+    if (eqIndex <= 0) {
+      throw new Error(
+        `Invalid environment variable format: ${envVar}. Expected KEY=value`,
+      );
+    }
+    // An empty value ("KEY=") is allowed and stored as "".
+    result[envVar.slice(0, eqIndex)] = envVar.slice(eqIndex + 1);
+  }
+  return result;
+}
+
+// Parse ENV_VAR=SECRET_NAME strings into a map of env var name -> secret name.
+function parseSecrets(secrets: string[]): Record<string, string> {
+  const result: Record<string, string> = {};
+  for (const secret of secrets) {
+    const eqIndex = secret.indexOf("=");
+    // Both sides are names, so neither may be empty ("=X" and "X=" are rejected).
+    if (eqIndex <= 0 || eqIndex === secret.length - 1) {
+      throw new Error(
+        `Invalid secret format: ${secret}. Expected ENV_VAR=SECRET_NAME`,
+      );
+    }
+    // Later duplicates of the same ENV_VAR overwrite earlier ones.
+    result[secret.slice(0, eqIndex)] = secret.slice(eqIndex + 1);
+  }
+  return result;
+}
+
+// Validate agent is an own key of SUPPORTED_AGENTS (`in` would also accept "toString" etc.)
+function validateAgent(agent: string): asserts agent is SupportedAgent {
+  if (!Object.prototype.hasOwnProperty.call(SUPPORTED_AGENTS, agent)) {
+    const supportedList = Object.keys(SUPPORTED_AGENTS).join(", ");
+    throw new Error(
+      `Unsupported agent: ${agent}. Supported agents: ${supportedList}`,
+    );
+  }
+}
+
+// List existing secrets (one page of up to 5000; later ones would go unseen)
+async function listExistingSecrets() {
+  const client = getClient();
+  // TODO: Fetch by name when API exposed.
+  const result = await client.secrets.list({ limit: 5000 });
+  return result.secrets ?? [];
+}
+
+// Create a secret
+async function createSecret(name: string, value: string): Promise<void> {
+  const client = getClient();
+  await client.secrets.create({ name, value });
+}
+
+// Ensure agent secrets exist, creating them from env vars if needed.
+// Returns the secrets mapping (ENV_VAR -> BMJ_ENV_VAR). Progress is logged to
+// stderr so stdout carries only the command's result (job ID / JSON / YAML).
+async function ensureAgentSecrets(
+  agent: SupportedAgent,
+): Promise<Record<string, string>> {
+  const agentConfig = SUPPORTED_AGENTS[agent];
+  const secrets: Record<string, string> = {};
+  // One listing serves every variable: the names created below are distinct
+  const existingSecrets = await listExistingSecrets();
+
+  for (const varName of agentConfig.automaticEnvVars) {
+    const secretName = `${SECRET_PREFIX}${varName}`;
+    const envValue = process.env[varName];
+
+    if (existingSecrets.some((s) => s.name === secretName)) {
+      console.error(chalk.dim(`Secret ${secretName} exists`));
+      secrets[varName] = secretName;
+    } else if (envValue) {
+      // Create secret from env var
+      console.error(
+        chalk.cyan(`Creating secret ${secretName} from ${varName} env var`),
+      );
+      await createSecret(secretName, envValue);
+      secrets[varName] = secretName;
+    } else {
+      // No secret and no env var - skip (will be validated later if required)
+      console.error(
+        chalk.yellow(
+          `Secret ${secretName} not found and ${varName} not set in environment`,
+        ),
+      );
+    }
+  }
+
+  return secrets;
+}
+
+/**
+ * Resolve a benchmark reference to an ID.
+ * Values starting with "bm_" or "bmk_" are returned as-is; anything else must
+ * exactly match the name of one of the user's or the public benchmarks,
+ * otherwise an Error (listing up to five search hits, if any) is thrown.
+ */
+async function resolveBenchmarkId(benchmarkIdOrName: string): Promise<string> {
+  // Known ID prefixes short-circuit the lookup entirely
+  const idPrefixes = ["bm_", "bmk_"];
+  if (idPrefixes.some((prefix) => benchmarkIdOrName.startsWith(prefix))) {
+    return benchmarkIdOrName;
+  }
+
+  // Query the user's own and the public benchmarks concurrently
+  const query = { limit: 100, search: benchmarkIdOrName };
+  const [own, shared] = await Promise.all([
+    listBenchmarks({ ...query }),
+    listPublicBenchmarks({ ...query }),
+  ]);
+
+  // The user's benchmarks come first, so they win when a name exists in both
+  const candidates = [...own.benchmarks, ...shared.benchmarks];
+
+  // Return the first candidate whose name matches exactly
+  for (const candidate of candidates) {
+    if (candidate.name === benchmarkIdOrName) {
+      return candidate.id;
+    }
+  }
+
+  if (candidates.length === 0) {
+    throw new Error(`No benchmark found with name: ${benchmarkIdOrName}`);
+  }
+
+  // Only inexact search hits remain: offer up to five of them as suggestions
+  const suggestionLines: string[] = [];
+  for (const candidate of candidates.slice(0, 5)) {
+    suggestionLines.push(`  - ${candidate.name} (${candidate.id})`);
+  }
+  const suggestions = suggestionLines.join("\n");
+
+  throw new Error(
+    `No exact match for benchmark "${benchmarkIdOrName}". Did you mean:\n${suggestions}`,
+  );
+}
+
+// Parse a numeric CLI option (fallback when unset); throws rather than pass NaN on
+function numericOption(
+  raw: string | undefined,
+  flag: string,
+  fallback: number,
+  parse: (text: string) => number = (text) => parseInt(text, 10),
+): number {
+  if (!raw) return fallback;
+  const value = parse(raw);
+  if (Number.isNaN(value)) {
+    throw new Error(`Invalid value for ${flag}: ${raw}. Expected a number`);
+  }
+  return value;
+}
+
+// Validate options, ensure the agent's secrets exist, create the job, print it
+export async function runBenchmarkJob(options: RunOptions) {
+  try {
+    validateAgent(options.agent);
+    const agent = options.agent as SupportedAgent;
+
+    // Check all flags first so a bad one fails before any secret gets created
+    if (!options.benchmark && !options.scenarios) {
+      throw new Error("Either --benchmark or --scenarios must be specified");
+    }
+    if (options.benchmark && options.scenarios) {
+      throw new Error("Cannot specify both --benchmark and --scenarios");
+    }
+    const providedEnvVars = parseEnvVars(options.envVars ?? []);
+    const providedSecrets = parseSecrets(options.secrets ?? []);
+    const timeoutSeconds = numericOption(options.timeout, "--timeout", 1800);
+    const orchestratorConfig = {
+      nConcurrentTrials: numericOption(
+        options.nConcurrentTrials,
+        "--n-concurrent-trials",
+        10,
+      ),
+      nAttempts: numericOption(options.nAttempts, "--n-attempts", 1),
+      timeoutMultiplier: numericOption(
+        options.timeoutMultiplier,
+        "--timeout-multiplier",
+        1.0,
+        parseFloat,
+      ),
+      quiet: false,
+    };
+
+    // Resolve benchmark ID if name was provided
+    const benchmarkId = options.benchmark
+      ? await resolveBenchmarkId(options.benchmark)
+      : undefined;
+
+    // Auto-populated secrets (ENV_VAR -> BMJ_ENV_VAR); explicit --secrets win
+    const agentSecrets = await ensureAgentSecrets(agent);
+    const secrets = { ...agentSecrets, ...providedSecrets };
+
+    // requiresAny agents need a credential from secrets, --secrets or --env-vars
+    const agentConfig = SUPPORTED_AGENTS[agent];
+    const hasCredential = agentConfig.automaticEnvVars.some(
+      (varName) => Boolean(secrets[varName] || providedEnvVars[varName]),
+    );
+    if (agentConfig.requiresAny && !hasCredential) {
+      throw new Error(
+        `Agent ${agent} requires at least one of: ${agentConfig.automaticEnvVars.join(", ")}. ` +
+          `Create secrets (${agentConfig.automaticEnvVars.map((v) => `${SECRET_PREFIX}${v}`).join(", ")}) ` +
+          `or set environment variables.`,
+      );
+    }
+
+    // Create the benchmark job
+    const job = await createBenchmarkJob({
+      name: options.jobName,
+      benchmarkId,
+      scenarioIds: options.scenarios,
+      agentConfigs: [
+        {
+          name: agent,
+          modelName: options.model,
+          timeoutSeconds,
+          environmentVariables:
+            Object.keys(providedEnvVars).length > 0
+              ? providedEnvVars
+              : undefined,
+          secrets,
+        },
+      ],
+      orchestratorConfig,
+    });
+
+    // Output result
+    if (!options.output || options.output === "text") {
+      console.log(job.id);
+    } else {
+      output(job, { format: options.output, defaultFormat: "json" });
+    }
+  } catch (error) {
+    outputError("Failed to run benchmark job", error);
+  }
+}
diff --git a/src/commands/benchmark-job/status.ts b/src/commands/benchmark-job/status.ts
new file mode 100644
index 00000000..c65bef8a
--- /dev/null
+++ b/src/commands/benchmark-job/status.ts
@@ -0,0 +1,295 @@
+/**
+ * Status benchmark job command
+ */
+
+import chalk from "chalk";
+import { getBenchmarkJob } from "../../services/benchmarkJobService.js";
+import { output, outputError } from "../../utils/output.js";
+
+interface StatusOptions {
+  watch?: boolean; // Poll until the job reaches a completed state before printing
+  output?: string; // Unset or "text" prints formatted text; otherwise passed to output()
+}
+
+// Job states treated as finished: polling stops and results are printed
+const COMPLETED_STATES = ["completed", "failed", "canceled", "timeout"];
+
+// Polling config (used by the --watch loop)
+const POLL_INTERVAL_MS = 10 * 1000; // 10 seconds
+const MAX_WAIT_MS = 60 * 60 * 4 * 1000; // 4 hours
+
+// Promise wrapper around setTimeout: settles once ms milliseconds have passed
+function sleep(ms: number): Promise<void> {
+  return new Promise((wake) => setTimeout(wake, ms));
+}
+
+interface ScenarioOutcome {
+  scenario_name?: string; // Display name; scenario_definition_id is the fallback
+  scenario_definition_id?: string;
+  state?: string; // Compared case-insensitively against "COMPLETED"
+  score?: number; // Exactly 1.0 counts as a pass
+}
+
+interface BenchmarkOutcome {
+  agent_name?: string; // Shown as "unknown" when missing
+  model_name?: string; // Appended in parentheses unless missing or "default"
+  scenario_outcomes?: ScenarioOutcome[];
+}
+
+interface JobData {
+  id: string;
+  name?: string; // Display name; id is the fallback
+  state?: string; // Finished when it is one of COMPLETED_STATES
+  benchmark_outcomes?: BenchmarkOutcome[]; // Rendered as one table row each
+}
+
+/**
+ * Tally scenario outcomes into three buckets.
+ * - passed: state is COMPLETED (any letter case) and score is exactly 1.0
+ * - failedZero: state is COMPLETED with any other score, or no score at all
+ * - failedError: every other state, including a missing one
+ */
+function calculateStats(outcomes: ScenarioOutcome[]): {
+  total: number;
+  passed: number;
+  failedZero: number;
+  failedError: number;
+} {
+  const tally = {
+    total: outcomes.length,
+    passed: 0,
+    failedZero: 0,
+    failedError: 0,
+  };
+
+  outcomes.forEach(({ state, score }) => {
+    if (state?.toUpperCase() !== "COMPLETED") {
+      // Anything that did not reach COMPLETED counts as an error
+      tally.failedError += 1;
+    } else if (score === 1.0) {
+      // Only a perfect score is a pass
+      tally.passed += 1;
+    } else {
+      // Completed with any other score (including a missing one)
+      tally.failedZero += 1;
+    }
+  });
+
+  return tally;
+}
+
+// Format count/total as a percentage with one decimal ("0.0%" when total is 0)
+function formatPercent(count: number, total: number): string {
+  const ratio = total === 0 ? 0 : count / total;
+  return `${(ratio * 100).toFixed(1)}%`;
+}
+
+// Print a short summary: name, ID, state and, for finished jobs, the pass rate
+function printStatus(job: JobData): void {
+  const state = job.state || "unknown";
+
+  console.log(`Job: ${job.name || job.id}`);
+  console.log(`ID: ${job.id}`);
+  console.log(`State: ${state}`);
+
+  // Guard clause: unfinished jobs have no results worth summarizing
+  if (!COMPLETED_STATES.includes(state)) {
+    return;
+  }
+
+  // Sum the per-agent tallies into job-wide totals
+  const perAgent = (job.benchmark_outcomes || []).map((outcome) =>
+    calculateStats(outcome.scenario_outcomes || []),
+  );
+  const totalScenarios = perAgent.reduce((sum, s) => sum + s.total, 0);
+  const totalPassed = perAgent.reduce((sum, s) => sum + s.passed, 0);
+
+  // Stay silent when there is nothing to report
+  if (totalScenarios > 0) {
+    console.log(
+      `Results: ${totalPassed}/${totalScenarios} passed (${formatPercent(totalPassed, totalScenarios)})`,
+    );
+  }
+}
+
+// Print the per-agent summary table, each row followed by its scenario outcomes
+function printResultsTable(job: JobData): void {
+  const outcomes = job.benchmark_outcomes || [];
+
+  if (outcomes.length === 0) {
+    console.log(chalk.yellow("No benchmark outcomes found"));
+    return;
+  }
+
+  // Header
+  console.log();
+  console.log(chalk.bold("Benchmark Job Results"));
+  console.log(chalk.dim(`Job ID: ${job.id}`));
+  if (job.name) {
+    console.log(chalk.dim(`Name: ${job.name}`));
+  }
+  console.log(chalk.dim(`State: ${job.state}`));
+  console.log();
+
+  // Table header (column widths 40 + 10 + 14 + 16 + 8 = 88, the rule's length)
+  const agentCol = "Agent / Model".padEnd(40);
+  const passedCol = "Passed".padStart(10);
+  const failedCol = "Failed (0.0)".padStart(14);
+  const errorCol = "Failed (error)".padStart(16);
+  const totalCol = "Total".padStart(8);
+
+  console.log(
+    chalk.bold(agentCol + passedCol + failedCol + errorCol + totalCol),
+  );
+  console.log(chalk.dim("-".repeat(88)));
+
+  // Print each agent's results
+  for (const outcome of outcomes) {
+    const agentName = outcome.agent_name || "unknown";
+    const modelName = outcome.model_name || "default";
+    const scenarioOutcomes = outcome.scenario_outcomes || [];
+
+    const stats = calculateStats(scenarioOutcomes);
+
+    // Format agent/model column (truncated so it always fits the 40-wide column)
+    let agentModelStr = agentName;
+    if (modelName && modelName !== "default") {
+      agentModelStr += ` (${modelName})`;
+    }
+    if (agentModelStr.length > 38) {
+      agentModelStr = agentModelStr.slice(0, 35) + "...";
+    }
+    const agentModelCol = agentModelStr.padEnd(40);
+
+    // Format stats columns with colors (dimmed when the count is zero)
+    const passedStr = formatPercent(stats.passed, stats.total);
+    const failedZeroStr = formatPercent(stats.failedZero, stats.total);
+    const failedErrorStr = formatPercent(stats.failedError, stats.total);
+
+    const passedColored =
+      stats.passed > 0
+        ? chalk.green(passedStr.padStart(10))
+        : chalk.dim(passedStr.padStart(10));
+
+    const failedZeroColored =
+      stats.failedZero > 0
+        ? chalk.yellow(failedZeroStr.padStart(14))
+        : chalk.dim(failedZeroStr.padStart(14));
+
+    const failedErrorColored =
+      stats.failedError > 0
+        ? chalk.red(failedErrorStr.padStart(16))
+        : chalk.dim(failedErrorStr.padStart(16));
+
+    const totalColStr = String(stats.total).padStart(8);
+
+    console.log(
+      agentModelCol +
+        passedColored +
+        failedZeroColored +
+        failedErrorColored +
+        chalk.dim(totalColStr),
+    );
+
+    // Print individual scenario results underneath (indented)
+    for (const scenario of scenarioOutcomes) {
+      const scenarioName =
+        scenario.scenario_name || scenario.scenario_definition_id || "unknown";
+      const state = scenario.state || "unknown";
+      const score = scenario.score;
+
+      let statusIcon: string;
+      let statusColor: typeof chalk.green;
+
+      if (state.toUpperCase() === "COMPLETED") {
+        if (score === 1.0) {
+          statusIcon = chalk.green("\u2713"); // checkmark
+          statusColor = chalk.green;
+        } else {
+          statusIcon = chalk.yellow("\u2717"); // X
+          statusColor = chalk.yellow;
+        }
+      } else {
+        statusIcon = chalk.red("!");
+        statusColor = chalk.red;
+      }
+
+      const scenarioNameTrunc =
+        scenarioName.length > 50
+          ? scenarioName.slice(0, 47) + "..."
+          : scenarioName;
+      // JobData is an unchecked cast, so score may be missing or null: show state then
+      const scoreStr =
+        typeof score === "number" ? `score=${score.toFixed(1)}` : state;
+
+      console.log(
+        chalk.dim("  ") +
+          statusIcon +
+          " " +
+          chalk.dim(scenarioNameTrunc.padEnd(52)) +
+          statusColor(scoreStr),
+      );
+    }
+  }
+
+  console.log();
+}
+
+// Print a job's status/results; with --watch, poll until it has finished first
+export async function statusBenchmarkJob(
+  id: string,
+  options: StatusOptions = {},
+) {
+  try {
+    let job = (await getBenchmarkJob(id)) as unknown as JobData;
+
+    const isComplete = COMPLETED_STATES.includes(job.state || "");
+
+    // If not waiting or already complete, just print status/results
+    if (!options.watch || isComplete) {
+      if (options.output && options.output !== "text") {
+        output(job, { format: options.output, defaultFormat: "json" });
+      } else if (isComplete) {
+        printResultsTable(job);
+      } else {
+        printStatus(job);
+      }
+      return;
+    }
+
+    // Wait mode: poll until complete. Progress is written to stderr so that
+    // stdout carries only the final result and -o json/yaml stays parseable
+    const jobName = job.name || job.id;
+    console.error(chalk.cyan(`Awaiting job "${jobName}" completion...`));
+    console.error(chalk.dim(`Current state: ${job.state}`));
+    console.error();
+
+    const startTime = Date.now();
+
+    while (!COMPLETED_STATES.includes(job.state || "")) {
+      // Check timeout (relies on outputError() ending the process - TODO confirm)
+      if (Date.now() - startTime > MAX_WAIT_MS) {
+        console.error();
+        outputError(
+          `Timeout waiting for job completion after ${MAX_WAIT_MS / 1000 / 60} minutes`,
+        );
+      }
+
+      await sleep(POLL_INTERVAL_MS);
+      job = (await getBenchmarkJob(id)) as unknown as JobData;
+      process.stderr.write(chalk.dim("."));
+    }
+
+    console.error();
+    console.error();
+
+    // Output based on format
+    if (options.output && options.output !== "text") {
+      output(job, { format: options.output, defaultFormat: "json" });
+    } else {
+      printResultsTable(job);
+    }
+  } catch (error) {
+    outputError("Failed to get benchmark job status", error);
+  }
+}
diff --git a/src/services/benchmarkService.ts b/src/services/benchmarkService.ts
index 17a164cd..e6373464 100644
--- a/src/services/benchmarkService.ts
+++ b/src/services/benchmarkService.ts
@@ -179,6 +179,40 @@ export async function getBenchmark(id: string): Promise<Benchmark> {
   return client.benchmarks.retrieve(id);
 }
 
+/**
+ * List public benchmark definitions with pagination.
+ *
+ * @param options - Page size plus an optional cursor (`startingAfter`) and
+ *   `search` text; the latter two are only sent when set and non-empty.
+ * @returns The page's benchmarks, their count as `totalCount` (the size of
+ *   this page, not a server-side total) and whether more pages exist.
+ */
+export async function listPublicBenchmarks(
+  options: ListBenchmarksOptions,
+): Promise<ListBenchmarksResult> {
+  const { limit, startingAfter, search } = options;
+
+  // Optional filters are spread in only when they carry a value
+  const queryParams = {
+    limit,
+    ...(startingAfter ? { starting_after: startingAfter } : {}),
+    ...(search ? { search } : {}),
+  };
+
+  const client = getClient();
+  const page = await client.benchmarks.listPublic(queryParams);
+
+  // A missing list is treated as an empty page
+  const benchmarks = page.benchmarks || [];
+
+  // Built last so totalCount can reuse the normalized list
+  return {
+    benchmarks,
+    totalCount: benchmarks.length,
+    hasMore: page.has_more || false,
+  };
+}
+
 /**
  * Create/start a benchmark run with selected benchmarks
  */
diff --git a/src/utils/commands.ts b/src/utils/commands.ts
index 07c67d0b..80038122 100644
--- a/src/utils/commands.ts
+++ b/src/utils/commands.ts
@@ -1012,6 +1012,62 @@ export function createProgram(): Command {
       await installMcpConfig();
     });
 
+  // Benchmark job commands
+  const benchmarkJob = program
+    .command("benchmark-job")
+    .description("Manage benchmark jobs")
+    .alias("bmj");
+
+  benchmarkJob
+    .command("run")
+    .description("Run a benchmark job with an agent")
+    .requiredOption(
+      "--agent <agent>",
+      "Agent to use (claude-code, codex, opencode, goose, gemini-cli)",
+    )
+    .requiredOption("--model <model>", "Model name for the agent")
+    .option("--benchmark <id-or-name>", "Benchmark ID or name to run")
+    .option(
+      "--scenarios <ids...>",
+      "Scenario IDs to run (alternative to --benchmark)",
+    )
+    .option("-n, --job-name <name>", "Job name")
+    .option(
+      "--env-vars <vars...>",
+      "Additional environment variables (format: KEY=value)",
+    )
+    .option(
+      "--secrets <secrets...>",
+      "Secrets to inject as environment variables (format: ENV_VAR=SECRET_NAME)",
+    )
+    .option("--timeout <seconds>", "Agent timeout in seconds")
+    .option("--n-attempts <n>", "Number of attempts per scenario")
+    .option("--n-concurrent-trials <n>", "Number of concurrent trials")
+    .option("--timeout-multiplier <n>", "Timeout multiplier")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (options) => {
+      const { runBenchmarkJob } =
+        await import("../commands/benchmark-job/run.js");
+      await runBenchmarkJob(options);
+    });
+
+  benchmarkJob
+    .command("status <id>")
+    .description("Get benchmark job status and results")
+    .option("-w, --watch", "Watch for job to complete before showing results")
+    .option(
+      "-o, --output [format]",
+      "Output format: text|json|yaml (default: text)",
+    )
+    .action(async (id, options) => {
+      const { statusBenchmarkJob } =
+        await import("../commands/benchmark-job/status.js");
+      await statusBenchmarkJob(id, options);
+    });
+
   // Hidden command: 'rli mcp' without subcommand starts the server (for Claude Desktop config compatibility)
   program
     .command("mcp-server", { hidden: true })