From 651119436091782a25bbe884555a9347f9225eef Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 14 Mar 2026 06:08:54 +0000 Subject: [PATCH 1/3] feat: self-contained HTML dashboard for eval results (#562) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add html-writer.ts implementing OutputWriter interface that produces a single self-contained .html report file with all CSS/JS inlined. Views: - Overview: stat cards (total/passed/failed/errors/pass rate/duration/tokens/cost), multi-target comparison table, score distribution histogram - Test Cases: filterable/sortable table with per-evaluator score columns, expandable detail rows showing input/output, evaluator reasoning, expectations, and metadata Features: - Tab-based navigation between Overview and Test Cases views - Filter by status (pass/fail/error), target, and search by test ID - Sortable columns with direction indicators - Color-coded scores (green ≥90%, yellow ≥50%, red <50%) - Meta-refresh (2s) during live eval runs, removed on close - Thread-safe concurrent writes via Mutex - No external network requests, works fully offline - Zero new runtime dependencies Registration: - .html/.htm extension auto-detection in createWriterFromPath - 'html' format option in createOutputWriter - Updated CLI help text for --output and --output-format flags Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/eval/commands/run.ts | 4 +- apps/cli/src/commands/eval/html-writer.ts | 545 ++++++++++++++++++++ apps/cli/src/commands/eval/output-writer.ts | 12 +- 3 files changed, 557 insertions(+), 4 deletions(-) create mode 100644 apps/cli/src/commands/eval/html-writer.ts diff --git a/apps/cli/src/commands/eval/commands/run.ts b/apps/cli/src/commands/eval/commands/run.ts index da8d39fc9..4a7ec1b50 100644 --- a/apps/cli/src/commands/eval/commands/run.ts +++ b/apps/cli/src/commands/eval/commands/run.ts @@ -53,12 +53,12 @@ export const evalRunCommand = command({ long: 'output', short: 'o', description: - 'Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml', + 'Output file path(s). Format inferred from extension: .jsonl, .json, .xml, .yaml, .html', }), outputFormat: option({ type: optional(string), long: 'output-format', - description: "Output format: 'jsonl' or 'yaml' (default: jsonl)", + description: "Output format: 'jsonl', 'yaml', or 'html' (default: jsonl)", }), dryRun: flag({ long: 'dry-run', diff --git a/apps/cli/src/commands/eval/html-writer.ts b/apps/cli/src/commands/eval/html-writer.ts new file mode 100644 index 000000000..04ad4d296 --- /dev/null +++ b/apps/cli/src/commands/eval/html-writer.ts @@ -0,0 +1,545 @@ +import { mkdir, writeFile } from 'node:fs/promises'; +import path from 'node:path'; + +import { Mutex } from 'async-mutex'; + +import type { EvaluationResult } from '@agentv/core'; + +export class HtmlWriter { + private readonly filePath: string; + private readonly results: EvaluationResult[] = []; + private readonly mutex = new Mutex(); + private closed = false; + private isLive = true; + + private constructor(filePath: string) { + this.filePath = filePath; + } + + static async open(filePath: string): Promise { + await mkdir(path.dirname(filePath), { recursive: true }); + const writer = new HtmlWriter(filePath); + await writer.writeHtml(); + return writer; + } + + async append(result: EvaluationResult): Promise { + await this.mutex.runExclusive(async () => { + if (this.closed) { + throw new Error('Cannot write to closed HTML writer'); + } + this.results.push(result); + await this.writeHtml(); + }); + } + + async close(): Promise { + await this.mutex.runExclusive(async () => { + if (this.closed) { + return; + } + this.closed = true; + this.isLive = false; + await this.writeHtml(); + }); + } + + private async writeHtml(): Promise { + const html = generateHtml(this.results, this.isLive); + await writeFile(this.filePath, html, 'utf8'); + } +} + +function generateHtml(results: readonly EvaluationResult[], isLive: boolean): string { + // Strip heavy fields to keep file size manageable + const lightResults = results.map((r) => { + const { requests, trace, ...rest } = r as EvaluationResult & Record; + return rest; + }); + // Escape in JSON to prevent breaking out of script tag + const dataJson = JSON.stringify(lightResults).replace(/<\//g, '<\\/'); + const metaRefresh = isLive ? ' \n' : ''; + const liveIndicator = isLive + ? '\u25CF LIVE' + : `${escapeHtml(new Date().toISOString())}`; + + return ` + + + + +${metaRefresh} AgentV Evaluation Report + + + +
+
+

AgentV

+ Evaluation Report +
+
${liveIndicator}
+
+ +
+ + +`; +} + +function escapeHtml(s: string): string { + return s + .replace(/&/g, '&') + .replace(//g, '>') + .replace(/"/g, '"'); +} + +// --------------------------------------------------------------------------- +// Embedded CSS +// --------------------------------------------------------------------------- +const STYLES = ` +*{margin:0;padding:0;box-sizing:border-box} +:root{ + --bg:#f6f8fa;--surface:#fff;--border:#d0d7de;--border-light:#e8ebee; + --text:#1f2328;--text-muted:#656d76; + --primary:#0969da;--primary-bg:#ddf4ff; + --success:#1a7f37;--success-bg:#dafbe1; + --danger:#cf222e;--danger-bg:#ffebe9; + --warning:#9a6700;--warning-bg:#fff8c5; + --radius:6px; + --shadow:0 1px 3px rgba(31,35,40,.04),0 1px 2px rgba(31,35,40,.06); + --font:-apple-system,BlinkMacSystemFont,"Segoe UI","Noto Sans",Helvetica,Arial,sans-serif; + --mono:ui-monospace,SFMono-Regular,"SF Mono",Menlo,Consolas,monospace; +} +body{font-family:var(--font);background:var(--bg);color:var(--text);line-height:1.5;font-size:14px} + +/* Header */ +.header{background:var(--surface);border-bottom:1px solid var(--border);padding:12px 24px;display:flex;align-items:center;justify-content:space-between} +.header-left{display:flex;align-items:baseline;gap:12px} +.header-title{font-size:18px;font-weight:600} +.header-subtitle{font-size:14px;color:var(--text-muted)} +.live-badge{color:var(--success);font-size:12px;font-weight:600;animation:pulse 2s infinite} +@keyframes pulse{0%,100%{opacity:1}50%{opacity:.4}} +.timestamp{font-size:12px;color:var(--text-muted);font-family:var(--mono)} + +/* Tabs */ +.tabs{background:var(--surface);border-bottom:1px solid var(--border);padding:0 24px;display:flex} +.tab{background:none;border:none;padding:10px 16px;font-size:14px;color:var(--text-muted);cursor:pointer;border-bottom:2px solid transparent;font-family:var(--font);transition:color .15s,border-color .15s} +.tab:hover{color:var(--text)} +.tab.active{color:var(--text);font-weight:600;border-bottom-color:var(--primary)} + +#app{max-width:1280px;margin:0 auto;padding:24px} + +/* Stat cards */ +.stats-grid{display:grid;grid-template-columns:repeat(auto-fit,minmax(140px,1fr));gap:12px;margin-bottom:24px} +.stat-card{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;text-align:center;box-shadow:var(--shadow)} +.stat-card.pass .stat-value{color:var(--success)} +.stat-card.fail .stat-value{color:var(--danger)} +.stat-card.error .stat-value{color:var(--danger)} +.stat-card.warn .stat-value{color:var(--warning)} +.stat-card.total .stat-value{color:var(--primary)} +.stat-value{font-size:28px;font-weight:700;line-height:1.2} +.stat-label{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.5px;margin-top:4px} + +/* Sections */ +.section{margin-bottom:24px} +.section-title{font-size:16px;font-weight:600;margin-bottom:12px} + +/* Tables */ +.table-wrap{overflow-x:auto;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);box-shadow:var(--shadow)} +.data-table{width:100%;border-collapse:collapse;font-size:13px} +.data-table th{background:var(--bg);border-bottom:1px solid var(--border);padding:8px 12px;text-align:left;font-weight:600;font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;white-space:nowrap} +.data-table th.sortable{cursor:pointer;user-select:none} +.data-table th.sortable:hover{color:var(--text)} +.data-table td{padding:8px 12px;border-bottom:1px solid var(--border-light);vertical-align:middle} +.data-table tbody tr:last-child td{border-bottom:none} + +/* Status icons */ +.status-icon{display:inline-flex;align-items:center;justify-content:center;width:22px;height:22px;border-radius:50%;font-size:12px;font-weight:700} +.status-icon.pass{background:var(--success-bg);color:var(--success)} +.status-icon.fail{background:var(--danger-bg);color:var(--danger)} +.status-icon.error{background:var(--warning-bg);color:var(--warning)} + +/* Score colors */ +.score-high{color:var(--success);font-weight:600} +.score-mid{color:var(--warning);font-weight:600} +.score-low{color:var(--danger);font-weight:600} + +/* Pass-rate bar */ +.bar-bg{width:100px;height:8px;background:var(--border-light);border-radius:4px;overflow:hidden} +.bar-fill{height:100%;border-radius:4px;transition:width .3s} +.bar-fill.score-high{background:var(--success)} +.bar-fill.score-mid{background:var(--warning)} +.bar-fill.score-low{background:var(--danger)} + +/* Histogram */ +.histogram{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:16px;box-shadow:var(--shadow)} +.hist-row{display:flex;align-items:center;gap:12px;margin-bottom:8px} +.hist-row:last-child{margin-bottom:0} +.hist-label{width:60px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0} +.hist-bar-bg{flex:1;height:20px;background:var(--border-light);border-radius:3px;overflow:hidden} +.hist-bar{height:100%;border-radius:3px;transition:width .3s} +.hist-count{width:30px;font-size:12px;color:var(--text-muted);text-align:right;flex-shrink:0} + +/* Filters */ +.filter-bar{display:flex;gap:8px;margin-bottom:16px;align-items:center;flex-wrap:wrap} +.filter-select,.filter-search{padding:6px 10px;border:1px solid var(--border);border-radius:var(--radius);font-size:13px;background:var(--surface);color:var(--text);font-family:var(--font)} +.filter-search{flex:1;min-width:200px} +.filter-count{font-size:12px;color:var(--text-muted);margin-left:auto} + +/* Test rows */ +.test-row{cursor:pointer;transition:background .1s} +.test-row:hover{background:var(--bg)!important} +.test-row.expanded{background:var(--primary-bg)!important} +.expand-col{width:32px;text-align:center} +.expand-icon{color:var(--text-muted);font-size:12px} +.fw-medium{font-weight:500} +.text-pass{color:var(--success)}.text-fail{color:var(--danger)}.text-error{color:var(--warning)} + +/* Detail panel */ +.detail-row td{padding:0!important;background:var(--bg)!important} +.detail-panel{padding:16px 24px} +.detail-grid{display:grid;grid-template-columns:1fr 1fr;gap:16px;margin-bottom:16px} +.detail-block h4{font-size:12px;color:var(--text-muted);text-transform:uppercase;letter-spacing:.3px;margin-bottom:6px} +.detail-pre{background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);padding:12px;font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word;max-height:300px;overflow-y:auto;line-height:1.6} +.detail-panel h4{font-size:13px;font-weight:600;margin:16px 0 8px} +.eval-table{width:100%;border-collapse:collapse;font-size:13px;background:var(--surface);border:1px solid var(--border);border-radius:var(--radius);margin-bottom:12px} +.eval-table th{background:var(--bg);padding:6px 10px;text-align:left;font-size:11px;font-weight:600;color:var(--text-muted);text-transform:uppercase;border-bottom:1px solid var(--border)} +.eval-table td{padding:8px 10px;border-bottom:1px solid var(--border-light)} +.reasoning-cell{max-width:500px;font-size:12px;color:var(--text-muted)} +.expect-list{list-style:none;padding:0;margin-bottom:12px} +.expect-list li{padding:4px 8px 4px 24px;position:relative;font-size:13px} +.expect-list.pass li::before{content:"\\2713";position:absolute;left:4px;color:var(--success);font-weight:700} +.expect-list.fail li::before{content:"\\2717";position:absolute;left:4px;color:var(--danger);font-weight:700} +.error-box{background:var(--danger-bg);border:1px solid var(--danger);border-radius:var(--radius);padding:12px;margin-bottom:12px} +.error-box h4{color:var(--danger);margin:0 0 6px} +.error-box pre{font-family:var(--mono);font-size:12px;white-space:pre-wrap;word-break:break-word} +.detail-meta{font-size:12px;color:var(--text-muted);margin-top:12px;padding-top:12px;border-top:1px solid var(--border-light)} +.empty-state{text-align:center;padding:48px 24px;color:var(--text-muted)} +.empty-state h3{font-size:16px;margin-bottom:8px;color:var(--text)} +`; + +// --------------------------------------------------------------------------- +// Embedded JavaScript (no template literals — all string concatenation) +// --------------------------------------------------------------------------- +const SCRIPT = ` +(function(){ + /* ---- helpers ---- */ + function esc(s){ + if(s==null)return""; + return String(s).replace(/&/g,"&").replace(//g,">").replace(/"/g,"""); + } + function getStatus(r){ + if(r.executionStatus==="execution_error")return"error"; + if(r.executionStatus==="quality_failure")return"fail"; + if(r.executionStatus==="ok")return"pass"; + if(r.error)return"error"; + return r.score>=0.5?"pass":"fail"; + } + function sIcon(s){ + if(s==="pass")return'\\u2713'; + if(s==="fail")return'\\u2717'; + return'!'; + } + function fmtDur(ms){ + if(ms==null)return"\\u2014"; + if(ms<1000)return ms+"ms"; + if(ms<60000)return(ms/1000).toFixed(1)+"s"; + return Math.floor(ms/60000)+"m "+Math.round((ms%60000)/1000)+"s"; + } + function fmtTok(n){ + if(n==null)return"\\u2014"; + if(n>=1e6)return(n/1e6).toFixed(1)+"M"; + if(n>=1e3)return(n/1e3).toFixed(1)+"K"; + return String(n); + } + function fmtCost(u){if(u==null)return"\\u2014";if(u<0.01)return"<$0.01";return"$"+u.toFixed(2);} + function fmtPct(v){if(v==null)return"\\u2014";return(v*100).toFixed(1)+"%";} + function sCls(v){if(v==null)return"";if(v>=0.9)return"score-high";if(v>=0.5)return"score-mid";return"score-low";} + + /* ---- compute stats ---- */ + function computeStats(d){ + var t=d.length,p=0,f=0,e=0,dur=0,ti=0,to=0,cost=0,sc=[]; + for(var i=0;i0?p/g:0,dur:dur,tokens:ti+to,inTok:ti,outTok:to,cost:cost,scores:sc}; + } + function computeTargets(d){ + var m={}; + for(var i=0;i

No results yet

'+(IS_LIVE?"Waiting for evaluation results\\u2026 Page will auto-refresh.":"Run an evaluation to generate results.")+"

";return;} + if(state.tab==="overview")renderOverview();else renderTests(); + } + + /* ---- stat card helper ---- */ + function card(label,value,type){ + return'
'+value+'
'+label+"
"; + } + + /* ---- overview ---- */ + function renderOverview(){ + var h='
'; + h+=card("Total Tests",stats.total,"total"); + h+=card("Passed",stats.passed,"pass"); + h+=card("Failed",stats.failed,"fail"); + h+=card("Errors",stats.errors,"error"); + var prCls=stats.passRate>=0.9?"pass":stats.passRate>=0.5?"warn":"fail"; + h+=card("Pass Rate",fmtPct(stats.passRate),prCls); + h+=card("Duration",fmtDur(stats.dur),"neutral"); + h+=card("Tokens",fmtTok(stats.tokens),"neutral"); + h+=card("Est. Cost",fmtCost(stats.cost),"neutral"); + h+="
"; + + /* targets table */ + if(tgtStats.length>1){ + h+='

Targets

'; + h+=""; + for(var i=0;i0?t.p/g:0,avg=t.sc>0?t.ts/t.sc:0; + h+="'; + h+='"; + h+='"; + } + h+="
TargetPass RatePassedFailedErrorsAvg ScoreDurationTokensCost
"+esc(t.target)+""+fmtPct(pr)+'
'+t.p+''+t.f+''+t.e+"'+fmtPct(avg)+""+fmtDur(t.dur)+""+fmtTok(t.tok)+""+fmtCost(t.cost)+"
"; + } + + /* histogram */ + if(stats.scores.length>0){ + var bk=[0,0,0,0,0]; + for(var i=0;i0?(bk[i]/mx*100):0; + h+='
'+lb[i]+'
=2?"score-mid":"score-low")+'" style="width:'+pct+'%">
'+bk[i]+"
"; + } + h+=""; + } + app.innerHTML=h; + } + + /* ---- test cases ---- */ + function renderTests(){ + var evalNames=getEvalNames(); + var h='
'; + h+=''; + if(tgtNames.length>1){ + h+='"; + } + h+=''; + h+='
'; + + h+='
'; + h+=''; + h+=sHdr("Status","status"); + h+=sHdr("Test ID","testId"); + if(tgtNames.length>1)h+=sHdr("Target","target"); + h+=sHdr("Score","score"); + for(var i=0;i"+esc(evalNames[i])+""; + h+=sHdr("Duration","durationMs"); + h+=sHdr("Cost","costUsd"); + h+="
"; + app.innerHTML=h; + + /* wire events */ + var selS=document.getElementById("flt-status"); + selS.value=state.filter.status; + selS.addEventListener("change",function(e){state.filter.status=e.target.value;renderRows();}); + var selT=document.getElementById("flt-target"); + if(selT){selT.value=state.filter.target;selT.addEventListener("change",function(e){state.filter.target=e.target.value;renderRows();});} + document.getElementById("flt-search").addEventListener("input",function(e){state.filter.search=e.target.value;renderRows();}); + var ths=document.querySelectorAll("th[data-sort]"); + for(var i=0;i'+label+arrow+""; + } + + function filtered(){ + var out=[]; + for(var i=0;i1?1:0); + document.getElementById("flt-count").textContent=rows.length+" of "+DATA.length+" tests"; + var h=""; + for(var i=0;i'; + h+=''+(exp?"\\u25BE":"\\u25B8")+""; + h+=""+sIcon(s)+""; + h+=''+esc(r.testId)+""; + if(tgtNames.length>1)h+=""+esc(r.target)+""; + h+=''+fmtPct(r.score)+""; + for(var j=0;j'+(es!=null?fmtPct(es):"\\u2014")+""; + } + h+=""+fmtDur(r.durationMs)+""+fmtCost(r.costUsd)+""; + if(exp)h+=''+renderDetail(r)+""; + } + if(rows.length===0)h+='No matching tests'; + tbody.innerHTML=h; + + /* row click */ + var trs=tbody.querySelectorAll(".test-row"); + for(var k=0;k"; + } + h+='

Output

'+esc(r.answer||"")+"
"; + h+=""; + + /* evaluator results */ + if(r.scores&&r.scores.length>0){ + h+="

Evaluator Results

"; + h+=''; + for(var i=0;i=0.5?"pass":"fail"; + h+=""; + } + h+="
EvaluatorScoreStatusReasoning
"+esc(ev.name)+''+fmtPct(ev.score)+""+sIcon(evS)+''+esc(ev.reasoning||"")+"
"; + } + + /* hits / misses */ + if(r.hits&&r.hits.length>0){ + h+='

Passed Expectations

    '; + for(var i=0;i"+esc(r.hits[i])+""; + h+="
"; + } + if(r.misses&&r.misses.length>0){ + h+='

Failed Expectations

    '; + for(var i=0;i"+esc(r.misses[i])+""; + h+="
"; + } + + /* error */ + if(r.error)h+='

Error

'+esc(r.error)+"
"; + + /* metadata */ + h+='
'; + var m=[]; + if(r.tokenUsage)m.push(fmtTok(r.tokenUsage.input)+" in / "+fmtTok(r.tokenUsage.output)+" out tokens"); + if(r.durationMs)m.push(fmtDur(r.durationMs)); + if(r.target)m.push(r.target); + if(r.costUsd)m.push(fmtCost(r.costUsd)); + if(r.timestamp)m.push(r.timestamp); + h+=esc(m.join(" \\u00B7 ")); + h+="
"; + return h; + } + + /* ---- init ---- */ + render(); +})(); +`; diff --git a/apps/cli/src/commands/eval/output-writer.ts b/apps/cli/src/commands/eval/output-writer.ts index e84faccae..ecc4582b1 100644 --- a/apps/cli/src/commands/eval/output-writer.ts +++ b/apps/cli/src/commands/eval/output-writer.ts @@ -2,12 +2,13 @@ import path from 'node:path'; import type { EvaluationResult } from '@agentv/core'; +import { HtmlWriter } from './html-writer.js'; import { JsonWriter } from './json-writer.js'; import { JsonlWriter } from './jsonl-writer.js'; import { JunitWriter } from './junit-writer.js'; import { YamlWriter } from './yaml-writer.js'; -export type OutputFormat = 'jsonl' | 'yaml'; +export type OutputFormat = 'jsonl' | 'yaml' | 'html'; export interface OutputWriter { append(result: EvaluationResult): Promise; @@ -23,6 +24,8 @@ export async function createOutputWriter( return JsonlWriter.open(filePath); case 'yaml': return YamlWriter.open(filePath); + case 'html': + return HtmlWriter.open(filePath); default: { const exhaustiveCheck: never = format; throw new Error(`Unsupported output format: ${exhaustiveCheck}`); @@ -36,6 +39,8 @@ export function getDefaultExtension(format: OutputFormat): string { return '.jsonl'; case 'yaml': return '.yaml'; + case 'html': + return '.html'; default: { const exhaustiveCheck: never = format; throw new Error(`Unsupported output format: ${exhaustiveCheck}`); @@ -43,7 +48,7 @@ export function getDefaultExtension(format: OutputFormat): string { } } -const SUPPORTED_EXTENSIONS = new Set(['.jsonl', '.json', '.xml', '.yaml', '.yml']); +const SUPPORTED_EXTENSIONS = new Set(['.jsonl', '.json', '.xml', '.yaml', '.yml', '.html', '.htm']); export function createWriterFromPath(filePath: string): Promise { const ext = path.extname(filePath).toLowerCase(); @@ -57,6 +62,9 @@ export function createWriterFromPath(filePath: string): Promise { case '.yaml': case '.yml': return YamlWriter.open(filePath); + case '.html': + case '.htm': + return HtmlWriter.open(filePath); default: throw new Error( `Unsupported output file extension "${ext}". Supported: ${[...SUPPORTED_EXTENSIONS].join(', ')}`, From 04c5d629b8c71aabe04f59988c4a4a19a4cb4972 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 14 Mar 2026 08:08:53 +0000 Subject: [PATCH 2/3] =?UTF-8?q?feat:=20support=20JSONL=E2=86=92HTML=20conv?= =?UTF-8?q?ersion=20in=20convert=20command?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit agentv convert results.jsonl -o report.html When --out ends in .html or .htm, routes through HtmlWriter instead of the default YAML conversion. Falls back to a .html-suffixed path when no --out is given and the output extension is html. Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- apps/cli/src/commands/convert/index.ts | 33 ++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/apps/cli/src/commands/convert/index.ts b/apps/cli/src/commands/convert/index.ts index 615ffe137..897f46991 100644 --- a/apps/cli/src/commands/convert/index.ts +++ b/apps/cli/src/commands/convert/index.ts @@ -4,6 +4,23 @@ import { isAgentSkillsFormat, normalizeLineEndings, parseAgentSkillsEvals } from import { command, option, optional, positional, string } from 'cmd-ts'; import { stringify as stringifyYaml } from 'yaml'; +import { HtmlWriter } from '../eval/html-writer.js'; + +async function convertJsonlToHtml(inputPath: string, outputPath: string): Promise { + const content = readFileSync(inputPath, 'utf8'); + const lines = content + .trim() + .split('\n') + .filter((line) => line.trim()); + + const writer = await HtmlWriter.open(outputPath); + for (const line of lines) { + await writer.append(JSON.parse(line)); + } + await writer.close(); + return lines.length; +} + function convertJsonlToYaml(inputPath: string, outputPath: string): number { const content = readFileSync(inputPath, 'utf8'); const lines = content @@ -147,7 +164,7 @@ export function convertEvalsJsonToYaml(inputPath: string): string { export const convertCommand = command({ name: 'convert', - description: 'Convert between evaluation formats (JSONL→YAML, evals.json→EVAL.yaml)', + description: 'Convert between evaluation formats (JSONL→YAML, JSONL→HTML, evals.json→EVAL.yaml)', args: { input: positional({ type: string, @@ -158,7 +175,7 @@ export const convertCommand = command({ type: optional(string), long: 'out', short: 'o', - description: 'Output file path (defaults to stdout for evals.json, or .yaml for JSONL)', + description: 'Output file path (defaults to stdout for evals.json, .yaml or .html for JSONL)', }), }, handler: async ({ input, out }) => { @@ -181,6 +198,18 @@ export const convertCommand = command({ } if (ext === '.jsonl') { + const outExt = out ? path.extname(out).toLowerCase() : '.yaml'; + if (outExt === '.html' || outExt === '.htm') { + const outputPath = out ?? input.replace(/\.jsonl$/, '.html'); + try { + const count = await convertJsonlToHtml(input, outputPath); + console.log(`Converted ${count} records to ${path.resolve(outputPath)}`); + } catch (error) { + console.error(`Error: ${(error as Error).message}`); + process.exit(1); + } + return; + } const outputPath = out ?? input.replace(/\.jsonl$/, '.yaml'); try { const count = convertJsonlToYaml(input, outputPath); From 2ea9afcaa9477477f61ae5d8ea9e75eb25eedb48 Mon Sep 17 00:00:00 2001 From: Christopher Tso Date: Sat, 14 Mar 2026 09:09:00 +0000 Subject: [PATCH 3/3] docs: document HTML output format and convert command Co-authored-by: Copilot <223556219+Copilot@users.noreply.github.com> --- README.md | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/README.md b/README.md index d284f1886..0f330d836 100644 --- a/README.md +++ b/README.md @@ -193,6 +193,32 @@ agentv eval --dry-run evals/my-eval.yaml See `agentv eval --help` for all options: workers, timeouts, output formats, trace dumping, and more. +#### Output Formats + +Write results to different formats using the `-o` flag (format auto-detected from extension): + +```bash +# JSONL (default streaming format) +agentv eval evals/my-eval.yaml -o results.jsonl + +# Self-contained HTML dashboard (opens in any browser, no server needed) +agentv eval evals/my-eval.yaml -o report.html + +# Multiple formats simultaneously +agentv eval evals/my-eval.yaml -o results.jsonl -o report.html + +# JUnit XML for CI/CD integration +agentv eval evals/my-eval.yaml -o results.xml +``` + +The HTML report auto-refreshes every 2 seconds during a live run, then locks once the run completes. + +You can also convert an existing JSONL results file to HTML after the fact: + +```bash +agentv convert results.jsonl -o report.html +``` + #### Timeouts AgentV does not apply a default top-level evaluation timeout. If you want one, set it explicitly