Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ rli mcp install # Install Runloop MCP server configurat
rli benchmark-job run # Run a benchmark job with one or more ...
rli benchmark-job summary <id> # Get benchmark job summary and results
rli benchmark-job watch <id> # Watch benchmark job progress in real-...
rli benchmark-job list # List benchmark jobs
```


Expand Down
270 changes: 270 additions & 0 deletions src/commands/benchmark-job/list.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,270 @@
/**
* List benchmark jobs command
*/

import chalk from "chalk";
import {
listBenchmarkJobs,
type BenchmarkJob,
} from "../../services/benchmarkJobService.js";
import { output, outputError } from "../../utils/output.js";

/**
 * Raw options accepted by `listBenchmarkJobsCommand`, as handed over by the
 * CLI option parser (values are unparsed strings / flags).
 */
interface ListOptions {
  /** Look-back window in days; parsed as a base-10 integer, defaults to 1 when omitted. */
  days?: string;
  /** When set, no time cutoff is applied and `days` is ignored. */
  all?: boolean;
  /** Comma-separated job states to keep; entries are trimmed and lower-cased before validation. */
  status?: string;
  /** Output format; "text" (also the default) prints the table, any other value goes to `output()`. */
  output?: string;
}

// Job states accepted by the --status filter (user input is lower-cased before
// being checked against this list).
const VALID_STATES = [
  "initializing",
  "queued",
  "running",
  "completed",
  "failed",
  "cancelled",
  "timeout",
];

// Number of jobs requested per listBenchmarkJobs call while paginating.
const PAGE_SIZE = 100;

// --- Time formatting ---

/**
 * Render a timestamp as a short, human-friendly age relative to the current time.
 *
 * @param timestampMs - Moment to describe, in epoch milliseconds.
 * @returns "just now" (under a minute, including future timestamps),
 *   "<n>m ago", "<n>h ago", "<n>d ago", or an en-US "Mon D" date once the
 *   timestamp is seven or more days old.
 */
function formatTimeAgo(timestampMs: number): string {
  const MS_PER_MINUTE = 60_000;
  const MS_PER_HOUR = 3_600_000;
  const MS_PER_DAY = 86_400_000;

  const elapsedMs = Date.now() - timestampMs;

  // Walk the buckets from finest to coarsest, returning at the first fit.
  const minutes = Math.floor(elapsedMs / MS_PER_MINUTE);
  if (minutes < 1) {
    return "just now";
  }
  if (minutes < 60) {
    return `${minutes}m ago`;
  }

  const hours = Math.floor(elapsedMs / MS_PER_HOUR);
  if (hours < 24) {
    return `${hours}h ago`;
  }

  const days = Math.floor(elapsedMs / MS_PER_DAY);
  if (days < 7) {
    return `${days}d ago`;
  }

  // A week or older: show the calendar date instead of a relative age.
  return new Date(timestampMs).toLocaleDateString("en-US", {
    month: "short",
    day: "numeric",
  });
}

// --- Job stats aggregation ---

/** Per-job totals produced by `aggregateJobStats` for display in the table. */
interface JobStats {
  /** Sum of n_completed + n_failed + n_timeout across the job's outcomes. */
  done: number;
  /** Scenario count × agent count from the job spec; falls back to `done` when that product is 0. */
  total: number;
  /** Sum of n_failed + n_timeout across the job's outcomes. */
  errors: number;
  /** Unweighted mean of the outcomes' `average_score` values, or null when none has a score. */
  avgScore: number | null;
}

/**
 * Collapse a job's per-outcome counters into a single set of totals.
 *
 * @param job - Benchmark job whose `benchmark_outcomes` and `job_spec` are read.
 * @returns Finished/expected run counts, the error count and the unweighted
 *   mean of the outcomes' average scores (null when no outcome has a score).
 */
function aggregateJobStats(job: BenchmarkJob): JobStats {
  const spec = job.job_spec;
  // Expected runs = scenarios × agents; a missing agent list counts as one agent.
  const expectedRuns =
    (spec?.scenario_ids?.length || 0) * (spec?.agent_configs?.length || 1);

  const tally = { finished: 0, failed: 0, scoreTotal: 0, scored: 0 };

  for (const outcome of job.benchmark_outcomes || []) {
    tally.finished += outcome.n_completed + outcome.n_failed + outcome.n_timeout;
    tally.failed += outcome.n_failed + outcome.n_timeout;

    // Only outcomes that actually carry a score take part in the mean.
    if (outcome.average_score != null) {
      tally.scoreTotal += outcome.average_score;
      tally.scored += 1;
    }
  }

  const avgScore = tally.scored > 0 ? tally.scoreTotal / tally.scored : null;

  return {
    done: tally.finished,
    // When the spec yields no expected count, fall back to what has finished.
    total: expectedRuns || tally.finished,
    errors: tally.failed,
    avgScore,
  };
}

// --- Status coloring ---

/**
 * Wrap a job state in the chalk color used for it in the table.
 *
 * @param state - Job state string, compared case-sensitively.
 * @returns The colored state, or the state unchanged when it is not one of
 *   the recognized values.
 */
function colorState(state: string): string {
  if (state === "running") {
    return chalk.yellow(state);
  }
  if (state === "completed") {
    return chalk.green(state);
  }
  if (state === "failed" || state === "timeout") {
    return chalk.red(state);
  }
  if (state === "cancelled") {
    return chalk.dim(state);
  }
  if (state === "initializing" || state === "queued") {
    return chalk.cyan(state);
  }
  // Unrecognized states are printed uncolored.
  return state;
}

// --- Table printing ---

// Fixed column widths in characters (excluding NAME, which is dynamic and
// takes whatever terminal width remains).
const COL_ID = 16;
const COL_STARTED = 10;
const COL_STATUS = 14;
const COL_DONE = 9;
const COL_ERRORS = 8;
const COL_SCORE = 7;
// Total width consumed by everything except NAME: the six fixed columns plus
// the six single-space separators between the seven columns of a row.
const FIXED_WIDTH =
  COL_ID + COL_STARTED + COL_STATUS + COL_DONE + COL_ERRORS + COL_SCORE + 6; // 6 for spacing

/**
 * Shorten a string to at most `maxLen` UTF-16 code units, marking the cut
 * with a trailing ellipsis character.
 *
 * @param str - Text to shorten.
 * @param maxLen - Maximum length of the result.
 * @returns `str` unchanged when it already fits; an empty string when
 *   `maxLen` is zero or negative; otherwise the first `maxLen - 1` code
 *   units followed by "…".
 */
function truncate(str: string, maxLen: number): string {
  if (str.length <= maxLen) return str;
  // With no room at all, slice(0, maxLen - 1) would get a negative end index
  // and return nearly the whole string, overflowing the column.
  if (maxLen <= 0) return "";
  return str.slice(0, maxLen - 1) + "…";
}

/**
 * Print benchmark jobs to stdout as a fixed-width table followed by a job count.
 *
 * The NAME column absorbs whatever terminal width is left after the fixed
 * columns (minimum 10 characters; terminal width assumed 120 when unknown).
 *
 * @param jobs - Jobs to print, in display order. An empty list prints a
 *   single "No benchmark jobs found" line instead of a table.
 */
function printTable(jobs: BenchmarkJob[]): void {
  if (jobs.length === 0) {
    console.log(chalk.dim("No benchmark jobs found"));
    return;
  }

  const termWidth = process.stdout.columns || 120;
  const nameWidth = Math.max(10, termWidth - FIXED_WIDTH);

  // Header row and the rule beneath it.
  const headerLine = [
    "ID".padEnd(COL_ID),
    "NAME".padEnd(nameWidth),
    "STARTED".padEnd(COL_STARTED),
    "STATUS".padEnd(COL_STATUS),
    "DONE".padStart(COL_DONE),
    "ERRORS".padStart(COL_ERRORS),
    "SCORE".padStart(COL_SCORE),
  ].join(" ");
  console.log(chalk.bold(headerLine));
  const ruleLength = Math.min(headerLine.length, termWidth);
  console.log(chalk.dim("─".repeat(ruleLength)));

  // One row per job.
  for (const job of jobs) {
    const { done, total, errors, avgScore } = aggregateJobStats(job);

    // Pad the status by hand: the color escape codes added by chalk would
    // otherwise be counted by padEnd and throw the column off.
    const stateText = job.state || "unknown";
    const statusGap = Math.max(0, COL_STATUS - stateText.length);
    const statusCell = colorState(stateText) + " ".repeat(statusGap);

    const errorsText = String(errors).padStart(COL_ERRORS);
    const errorsCell =
      errors > 0 ? chalk.red(errorsText) : chalk.dim(errorsText);

    let scoreCell: string;
    if (avgScore === null) {
      scoreCell = chalk.dim("N/A".padStart(COL_SCORE));
    } else {
      const percent = Math.round(avgScore * 100);
      const percentText = `${percent}%`.padStart(COL_SCORE);
      scoreCell =
        percent >= 50 ? chalk.green(percentText) : chalk.yellow(percentText);
    }

    const cells = [
      truncate(job.id, COL_ID).padEnd(COL_ID),
      truncate(job.name || "", nameWidth).padEnd(nameWidth),
      formatTimeAgo(job.create_time_ms).padEnd(COL_STARTED),
      statusCell,
      `${done}/${total}`.padStart(COL_DONE),
      errorsCell,
      scoreCell,
    ];
    console.log(cells.join(" "));
  }

  // Footer: blank line, then the job count.
  console.log();
  const plural = jobs.length !== 1 ? "s" : "";
  console.log(chalk.dim(`${jobs.length} job${plural}`));
}

// --- Pagination and filtering ---

/**
 * Page through `listBenchmarkJobs` and collect jobs, optionally bounded by a
 * creation-time cutoff and narrowed by state.
 *
 * Pagination stops at the first job created before `cutoffMs`; this relies on
 * the API returning jobs newest-first.
 *
 * @param cutoffMs - Oldest creation time (epoch ms) to include, or null for no time limit.
 * @param statusFilter - Lower-case states to keep, or null to keep every state.
 * @returns The collected jobs after the status filter has been applied.
 */
async function fetchJobs(
  cutoffMs: number | null,
  statusFilter: Set<string> | null,
): Promise<BenchmarkJob[]> {
  const collected: BenchmarkJob[] = [];
  const isTooOld = (job: BenchmarkJob): boolean =>
    cutoffMs !== null && job.create_time_ms < cutoffMs;

  let pageCursor: string | undefined;
  let morePages = true;

  while (morePages) {
    const page = await listBenchmarkJobs({
      limit: PAGE_SIZE,
      startingAfter: pageCursor,
    });

    const firstOldIndex = page.jobs.findIndex(isTooOld);
    if (firstOldIndex !== -1) {
      // Crossed the cutoff: keep the newer jobs ahead of it and stop paging.
      collected.push(...page.jobs.slice(0, firstOldIndex));
      morePages = false;
    } else {
      collected.push(...page.jobs);
      if (!page.hasMore || page.jobs.length === 0) {
        morePages = false;
      } else {
        // The next page starts after the last job of this one.
        pageCursor = page.jobs[page.jobs.length - 1].id;
      }
    }
  }

  return applyStatusFilter(collected, statusFilter);
}

/**
 * Keep only the jobs whose state is in the given set.
 *
 * @param jobs - Jobs to filter.
 * @param statusFilter - Lower-case states to keep, or null for no filtering.
 * @returns The same array instance when no filter is given; otherwise a new
 *   array of matching jobs. A job without a state is matched as "".
 */
function applyStatusFilter(
  jobs: BenchmarkJob[],
  statusFilter: Set<string> | null,
): BenchmarkJob[] {
  if (statusFilter == null) {
    return jobs;
  }

  const isWanted = (job: BenchmarkJob): boolean => {
    // Compare case-insensitively; the filter set holds lower-case states.
    const normalizedState = job.state?.toLowerCase() ?? "";
    return statusFilter.has(normalizedState);
  };

  return jobs.filter(isWanted);
}

// --- Command entry point ---

/**
 * Entry point for the benchmark-job `list` command: validates the options,
 * fetches matching jobs and prints them.
 *
 * Jobs are limited to the last `days` days (default 1) unless `all` is set,
 * optionally narrowed to the given states, and shown oldest-first so the most
 * recent job ends up at the bottom of the output.
 *
 * @param options - Raw CLI options:
 *   - `days`: look-back window; must consist of digits only and be greater than 0.
 *   - `all`: skip the time cutoff entirely.
 *   - `status`: comma-separated states, each of which must be in VALID_STATES.
 *   - `output`: "text" (default) prints the table; anything else is passed to `output()`.
 * @returns Resolves once the jobs were printed or an error was reported via `outputError`.
 */
export async function listBenchmarkJobsCommand(
  options: ListOptions,
): Promise<void> {
  try {
    // Parse status filter
    let statusFilter: Set<string> | null = null;
    if (options.status) {
      const statuses = options.status
        .split(",")
        .map((s) => s.trim().toLowerCase());
      const invalid = statuses.filter((s) => !VALID_STATES.includes(s));
      if (invalid.length > 0) {
        outputError(
          `Invalid status: ${invalid.join(", ")}. Valid: ${VALID_STATES.join(", ")}`,
        );
        // outputError's definition is not visible here; returning explicitly
        // guarantees a rejected filter is never used should it ever return.
        return;
      }
      statusFilter = new Set(statuses);
    }

    // Compute time cutoff
    let cutoffMs: number | null = null;
    if (!options.all) {
      let days = 1;
      if (options.days) {
        const rawDays = options.days.trim();
        // parseInt alone accepts trailing garbage ("3abc" -> 3) and drops
        // fractions ("1.5" -> 1), so require a plain run of digits first.
        days = /^\+?\d+$/.test(rawDays) ? Number.parseInt(rawDays, 10) : NaN;
      }
      if (Number.isNaN(days) || days <= 0) {
        outputError("--days must be a positive integer");
        // Same reasoning as above: never compute a cutoff from a bad value.
        return;
      }
      cutoffMs = Date.now() - days * 86_400_000;
    }

    // Fetch and filter
    const jobs = await fetchJobs(cutoffMs, statusFilter);

    // Sort ascending by create_time_ms (oldest first, most recent at bottom)
    jobs.sort((a, b) => a.create_time_ms - b.create_time_ms);

    // Output
    const format = options.output || "text";
    if (format !== "text") {
      output(jobs, { format, defaultFormat: "json" });
    } else {
      printTable(jobs);
    }
  } catch (error) {
    outputError("Failed to list benchmark jobs", error);
  }
}
19 changes: 19 additions & 0 deletions src/utils/commands.ts
Original file line number Diff line number Diff line change
Expand Up @@ -1076,6 +1076,25 @@ export function createProgram(): Command {
await watchBenchmarkJob(id);
});

// Register the `list` subcommand on the benchmark-job command group.
// No defaults are declared on the options here; the help text only describes
// them and the command implementation applies them.
benchmarkJob
.command("list")
.description("List benchmark jobs")
.option("--days <n>", "Show jobs from the last N days (default: 1)")
.option("--all", "Show all jobs (no time filter)")
.option(
"--status <statuses>",
"Filter by status (comma-separated). Valid: initializing, queued, running, completed, failed, cancelled, timeout",
)
.option(
"-o, --output [format]",
"Output format: text|json|yaml (default: text)",
)
.action(async (options) => {
// Load the command module only when the subcommand is actually invoked.
const { listBenchmarkJobsCommand } =
await import("../commands/benchmark-job/list.js");
await listBenchmarkJobsCommand(options);
});

// Hidden command: 'rli mcp' without subcommand starts the server (for Claude Desktop config compatibility)
program
.command("mcp-server", { hidden: true })
Expand Down
Loading