Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
52 changes: 52 additions & 0 deletions src/agent-profile.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,52 @@
import { describe, expect, it } from 'vitest'
import { type AgentProfile, agentProfileHash } from './agent-profile'

/** Reference profile; each case below perturbs one field and compares hashes. */
const base: AgentProfile = {
  id: 'sonnet-baseline',
  model: 'claude-sonnet-4-6@2025-04-15',
  skills: ['intake', 'drafting'],
  promptVersion: 'v3',
  tools: ['vault', 'search'],
}

describe('agentProfileHash', () => {
  it('is deterministic for the same profile', () => {
    const original = agentProfileHash(base)
    const shallowCopy = agentProfileHash({ ...base })
    expect(original).toBe(shallowCopy)
  })

  it('is insensitive to skill + tool order', () => {
    const reordered: AgentProfile = {
      ...base,
      skills: ['drafting', 'intake'],
      tools: ['search', 'vault'],
    }
    expect(agentProfileHash(reordered)).toBe(agentProfileHash(base))
  })

  it('ignores the human-facing id label — behaviour identity, not name', () => {
    const relabelled: AgentProfile = { ...base, id: 'a-different-label' }
    expect(agentProfileHash(relabelled)).toBe(agentProfileHash(base))
  })

  it('changes when the model changes', () => {
    const otherModel: AgentProfile = { ...base, model: 'claude-opus-4-6@2025-04-15' }
    expect(agentProfileHash(otherModel)).not.toBe(agentProfileHash(base))
  })

  it('changes when a skill is added — the primary behaviour lever', () => {
    const extraSkill: AgentProfile = { ...base, skills: ['intake', 'drafting', 'redline'] }
    expect(agentProfileHash(extraSkill)).not.toBe(agentProfileHash(base))
  })

  it('changes when the prompt version changes', () => {
    const bumpedPrompt: AgentProfile = { ...base, promptVersion: 'v4' }
    expect(agentProfileHash(bumpedPrompt)).not.toBe(agentProfileHash(base))
  })

  it('treats an absent optional field and an empty one identically', () => {
    // Same model on both sides; only the presence of the (empty) lists differs.
    const omitted: AgentProfile = { id: 'a', model: 'm@2025-01-01' }
    const emptied: AgentProfile = { id: 'b', model: 'm@2025-01-01', skills: [], tools: [] }
    expect(agentProfileHash(omitted)).toBe(agentProfileHash(emptied))
  })

  it('throws on a profile with no model — an unkeyable profile fails loud', () => {
    // A whitespace-only model counts as missing.
    const hashBlankModel = () => agentProfileHash({ id: 'broken', model: ' ' })
    expect(hashBlankModel).toThrow(/no model/)
  })
})
59 changes: 59 additions & 0 deletions src/agent-profile.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,59 @@
/**
* @stable
*
* AgentProfile — the eval harness's unit of variation.
*
* A profile pins everything that changes agent behaviour for a benchmark
* cell: the model, the active skills, the prompt version, the available
* tools. Vary the profile — swap a model, add a skill — and re-run the suite
* to benchmark the change. The scorecard keys a cell on
* `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
* inside the profile, and two profiles with the same model but different
* skills are different cells.
*
* `agentProfileHash` is the profile's behaviour identity. Two profiles whose
* behaviour-bearing fields (`model`, `skills`, `promptVersion`, `tools`,
* `metadata`) match share a hash (and a scorecard cell); reordering `skills`
* or `tools` does not change it, and the human-facing `id` label is excluded.
*/

import { createHash } from 'node:crypto'
import { ValidationError } from './errors'
import { canonicalize } from './pre-registration'

/**
* Everything that pins an agent's configuration for one benchmark cell.
* Only `id` and `model` are required; every other field may be omitted.
*/
export interface AgentProfile {
/** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash. */
id: string
/**
* Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`.
* `agentProfileHash` throws when this is blank or whitespace-only.
*/
model: string
/**
* Skill ids/versions active in this profile — the primary behaviour lever.
* Order does not affect the hash; omitting the field hashes like `[]`.
*/
skills?: string[]
/** Prompt version identifier. An omitted value is hashed as `null`. */
promptVersion?: string
/**
* Tool ids available to the agent.
* Order does not affect the hash; omitting the field hashes like `[]`.
*/
tools?: string[]
/**
* Any other behaviour-bearing knobs that should fingerprint into the hash.
* Omitting the field hashes like `{}`.
*/
metadata?: Record<string, string | number | boolean>
}

/**
 * Computes the behaviour fingerprint of a profile: the hex sha256 digest of
 * the JSON-serialized, canonicalized behaviour-bearing fields.
 *
 * - `model` is trimmed before hashing.
 * - `skills` and `tools` are hashed as sorted copies, so their order is
 *   irrelevant and the caller's arrays are never mutated.
 * - Missing `skills` / `tools` hash like `[]`, missing `promptVersion` like
 *   `null`, missing `metadata` like `{}`.
 * - The `id` label is not part of the hashed payload.
 *
 * @param profile - The profile to fingerprint.
 * @returns Lowercase hex sha256 digest.
 * @throws ValidationError when `model` is not a string or is blank — an
 *   unkeyable profile must fail loud rather than collapse into a blank-model
 *   cell.
 */
export function agentProfileHash(profile: AgentProfile): string {
  const { model } = profile
  const pinnedModel = typeof model === 'string' ? model.trim() : ''
  if (pinnedModel.length === 0) {
    throw new ValidationError(`AgentProfile "${profile.id}" has no model — cannot hash`)
  }

  // Sort a copy so list order never leaks into the hash.
  const sortedCopy = (ids: string[] | undefined): string[] => {
    const copy = Array.from(ids ?? [])
    copy.sort()
    return copy
  }

  // Key order is kept as-is; `canonicalize` (from ./pre-registration) is
  // opaque from here — presumably it normalizes object key order, confirm.
  const fingerprint = {
    model: pinnedModel,
    skills: sortedCopy(profile.skills),
    promptVersion: profile.promptVersion ?? null,
    tools: sortedCopy(profile.tools),
    metadata: profile.metadata ?? {},
  }

  const hasher = createHash('sha256')
  hasher.update(JSON.stringify(canonicalize(fingerprint)))
  return hasher.digest('hex')
}
42 changes: 42 additions & 0 deletions src/index.ts
Original file line number Diff line number Diff line change
Expand Up @@ -241,6 +241,26 @@
MODEL_PRICING,
TokenCounter,
} from './metrics'
export type {
PrReviewAuditCase,
PrReviewBenchmarkSummary,
PrReviewComment,
PrReviewMatchedFinding,
PrReviewOutcome,
PrReviewReferenceFinding,
PrReviewScore,
PrReviewScoreWeights,
PrReviewSeverity,
PrReviewSource,
} from './pr-review-benchmark'

Check failure on line 255 in src/index.ts

View workflow job for this annotation

GitHub Actions / ci

Cannot find module './pr-review-benchmark' or its corresponding type declarations.
export {
aggregatePrReviewScore,
commentsForSource,
DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
scorePrReviewComments,
scorePrReviewSource,
summarizePrReviewBenchmark,
} from './pr-review-benchmark'

Check failure on line 263 in src/index.ts

View workflow job for this annotation

GitHub Actions / ci

Cannot find module './pr-review-benchmark' or its corresponding type declarations.
/**
* @experimental
*/
Expand Down Expand Up @@ -553,6 +573,8 @@

// ── Auxiliary statistical + decision modules ─────────────────────────

export type { AgentProfile } from './agent-profile'
export { agentProfileHash } from './agent-profile'
export type { BaselineOptions, BaselineReport, MetricSamples, MetricVerdict } from './baseline'
export { compareToBaseline, iqr, welchsTTest } from './baseline'
export type { CostEntry, CostSummary, ScenarioCost, TokenSpec } from './cost-tracker'
Expand Down Expand Up @@ -580,6 +602,26 @@
} from './oracle'
export type { Direction, Objective, ParetoResult } from './pareto'
export { dominates, paretoFrontier } from './pareto'
// ── Eval scorecard — (persona × profile) score timeline ──────────────
export type {
CellVerdict,
DiffScorecardOptions,
RecordRunsOptions,
Scorecard,
ScorecardCell,
ScorecardCellDiff,
ScorecardDiff,
ScorecardEntry,
ScorecardLogLine,
} from './scorecard'
export {
appendScorecard,
diffScorecard,
formatScorecardDiff,
loadScorecard,
recordRuns,
recordRunsToScorecard,
} from './scorecard'
export type { SeriesConvergenceOptions, SeriesConvergenceResult } from './series-convergence'
export { analyzeSeries } from './series-convergence'
export type { Slo, SloCheckResult, SloComparator, SloReport, SloSeverity } from './slo'
Expand Down
183 changes: 183 additions & 0 deletions src/scorecard.test.ts
Original file line number Diff line number Diff line change
@@ -0,0 +1,183 @@
import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
import { tmpdir } from 'node:os'
import { join } from 'node:path'
import { afterEach, describe, expect, it } from 'vitest'
import type { AgentProfile } from './agent-profile'
import { agentProfileHash } from './agent-profile'
import type { RunRecord } from './run-record'
import {
  diffScorecard,
  formatScorecardDiff,
  loadScorecard,
  recordRuns,
  recordRunsToScorecard,
} from './scorecard'

// Shared fixture profile: makeRun copies its `model` into every run, and the
// tests pass it as the `profile` option to recordRuns / recordRunsToScorecard.
// No `tools` or `metadata` are set.
const profile: AgentProfile = {
id: 'sonnet-v3',
model: 'claude-sonnet-4-6@2025-04-15',
skills: ['intake', 'drafting'],
promptVersion: 'v3',
}

/**
 * Test factory for a RunRecord-shaped object carrying only the fields the
 * scorecard reads; everything else is a fixed placeholder.
 *
 * @param scenarioId - Scenario the run belongs to; also prefixes `runId`.
 * @param seed - Seed number; also suffixes `runId`.
 * @param score - Stored as both `outcome.holdoutScore` and `outcome.raw.score`.
 * @returns The literal, asserted to `RunRecord` (it is intentionally partial).
 */
function makeRun(scenarioId: string, seed: number, score: number): RunRecord {
  const runId = `${scenarioId}-seed${seed}`
  // Every run reuses the model of the shared fixture profile.
  const { model } = profile
  return {
    runId,
    experimentId: 'test',
    candidateId: 'cand',
    scenarioId,
    seed,
    model,
    promptHash: 'p',
    configHash: 'c',
    commitSha: 'sha',
    wallMs: 1,
    costUsd: 0,
    tokenUsage: { input: 1, output: 1 },
    outcome: { holdoutScore: score, raw: { score } },
    splitTag: 'holdout',
  } as RunRecord
}

/** Temp directories handed out by `tmpLog`, in creation order. */
const tmpDirs: string[] = []

/**
 * Creates a fresh `scorecard-*` directory under the OS temp dir, records it in
 * `tmpDirs`, and returns the path of a `scorecard.jsonl` file inside it. The
 * file itself is not created.
 */
function tmpLog(): string {
  const prefix = join(tmpdir(), 'scorecard-')
  const scratchDir = mkdtempSync(prefix)
  tmpDirs.push(scratchDir)
  return join(scratchDir, 'scorecard.jsonl')
}

// After every test, delete the temp directories created through tmpLog and
// forget them. Previously the list was only emptied, so each directory made
// by mkdtempSync was left behind on disk.
afterEach(() => {
  // splice(0) empties tmpDirs and returns the removed paths in one step.
  for (const dir of tmpDirs.splice(0)) {
    // force: a directory that is already gone must not fail the cleanup.
    rmSync(dir, { recursive: true, force: true })
  }
})

describe('recordRuns', () => {
  it('groups runs by scenario into one entry per cell', () => {
    // Two seeds for persona-a, one for persona-b → two cells expected.
    const lines = recordRuns(
      [makeRun('persona-a', 0, 0.8), makeRun('persona-a', 1, 0.9), makeRun('persona-b', 0, 0.5)],
      { profile, commitSha: 'abc123' },
    )
    expect(lines).toHaveLength(2)

    const personaA = lines.find((line) => line.scenarioId === 'persona-a')!
    const { entry } = personaA
    expect(entry.scores).toEqual([0.8, 0.9])
    expect(entry.composite).toBeCloseTo(0.85, 6) // median of [0.8, 0.9]
    expect(entry.runIds).toEqual(['persona-a-seed0', 'persona-a-seed1'])
    expect(personaA.profileHash).toBe(agentProfileHash(profile))
    expect(personaA.model).toBe(profile.model)
  })
})

describe('loadScorecard', () => {
  it('returns an empty scorecard for a missing file', () => {
    // tmpLog() returns a path inside a freshly created directory without
    // creating the file, so the path is guaranteed absent. A fixed name under
    // the shared OS temp dir could collide with a stray file.
    expect(loadScorecard(tmpLog())).toEqual({
      cells: [],
      profiles: {},
    })
  })

  it('round-trips appended runs and sorts each timeline chronologically', () => {
    const log = tmpLog()
    // Same scenario + profile recorded at two commits, one day apart.
    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.7)], {
      profile,
      commitSha: 'c1',
      timestamp: '2026-05-20T00:00:00Z',
    })
    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.9)], {
      profile,
      commitSha: 'c2',
      timestamp: '2026-05-21T00:00:00Z',
    })
    const card = loadScorecard(log)
    expect(card.cells).toHaveLength(1)
    expect(card.cells[0]!.timeline.map((e) => e.commitSha)).toEqual(['c1', 'c2'])
    expect(card.profiles[agentProfileHash(profile)]?.id).toBe('sonnet-v3')
  })

  it('skips a malformed line rather than failing the whole read', () => {
    const log = tmpLog()
    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.7)], { profile, commitSha: 'c1' })
    // Prepend two bad lines — plain text and a JSON object that is not a
    // scorecard line — ahead of the one valid record.
    writeFileSync(log, `not json at all\n{"partial":true}\n${readFileSync(log, 'utf8')}`)
    const card = loadScorecard(log)
    expect(card.cells).toHaveLength(1)
  })
})

describe('diffScorecard', () => {
  // Score clusters reused across cases; `build` only reads them.
  const highScores = [0.88, 0.91, 0.9, 0.89]
  const lowScores = [0.58, 0.62, 0.6, 0.59]

  /**
   * Writes one batch of `persona-a` runs per commit (one run per score, seeded
   * by index) into a fresh log, with timestamps one day apart starting
   * 2026-05-20, then loads the resulting scorecard.
   */
  function build(commits: Array<{ sha: string; scores: number[] }>) {
    const log = tmpLog()
    commits.forEach((commit, dayOffset) => {
      const runs = commit.scores.map((score, seed) => makeRun('persona-a', seed, score))
      recordRunsToScorecard(log, runs, {
        profile,
        commitSha: commit.sha,
        timestamp: `2026-05-${20 + dayOffset}T00:00:00Z`,
      })
    })
    return loadScorecard(log)
  }

  it('marks a cell with only one entry as new', () => {
    const card = build([{ sha: 'c1', scores: [0.8, 0.81, 0.82] }])
    const diff = diffScorecard(card)
    expect(diff.cells[0]!.verdict).toBe('new')
    expect(diff.summary.new).toBe(1)
  })

  it('flags a real regression — large effect, significant', () => {
    const card = build([
      { sha: 'c1', scores: highScores },
      { sha: 'c2', scores: lowScores },
    ])
    const diff = diffScorecard(card)
    const cell = diff.cells[0]!
    expect(cell.verdict).toBe('regressed')
    expect(cell.delta).toBeLessThan(0)
    expect(cell.cohensD).not.toBeNull()
    expect(diff.summary.regressed).toBe(1)
  })

  it('flags a real improvement', () => {
    const card = build([
      { sha: 'c1', scores: lowScores },
      { sha: 'c2', scores: highScores },
    ])
    const diff = diffScorecard(card)
    expect(diff.cells[0]!.verdict).toBe('improved')
  })

  it('calls an overlapping, tiny move flat — not a regression', () => {
    const card = build([
      { sha: 'c1', scores: [0.8, 0.82, 0.81, 0.8] },
      { sha: 'c2', scores: [0.81, 0.79, 0.8, 0.81] },
    ])
    const diff = diffScorecard(card)
    expect(diff.cells[0]!.verdict).toBe('flat')
    expect(diff.summary.regressed).toBe(0)
  })

  it('can diff against a named baseline commit, not just the predecessor', () => {
    // c2 sits just below c1; only the comparison against c1 is asserted.
    const card = build([
      { sha: 'c1', scores: highScores },
      { sha: 'c2', scores: [0.87, 0.9, 0.89, 0.88] },
      { sha: 'c3', scores: lowScores },
    ])
    const diff = diffScorecard(card, { baselineCommit: 'c1' })
    const cell = diff.cells[0]!
    expect(cell.baselineCommit).toBe('c1')
    expect(cell.verdict).toBe('regressed')
  })

  it('formatScorecardDiff surfaces a regression in the report', () => {
    const card = build([
      { sha: 'c1', scores: highScores },
      { sha: 'c2', scores: lowScores },
    ])
    const report = formatScorecardDiff(diffScorecard(card))
    expect(report).toMatch(/1 regressed/)
    expect(report).toMatch(/REGRESSED.*persona-a/)
  })
})
Loading
Loading