tangle-network · drewstone · May 22, 2026 · May 22, 2026
diff --git a/src/agent-profile.test.ts b/src/agent-profile.test.ts
@@ -0,0 +1,52 @@
+import { describe, expect, it } from 'vitest'
+import { type AgentProfile, agentProfileHash } from './agent-profile'
+
+const base: AgentProfile = { // reference profile that each case below perturbs
+  id: 'sonnet-baseline',
+  model: 'claude-sonnet-4-6@2025-04-15',
+  skills: ['intake', 'drafting'],
+  promptVersion: 'v3',
+  tools: ['vault', 'search'],
+}
+
+describe('agentProfileHash', () => {
+  it('is deterministic for the same profile', () => {
+    expect(agentProfileHash(base)).toBe(agentProfileHash({ ...base })) // shallow copy: equal fields, distinct object
+  })
+
+  it('is insensitive to skill + tool order', () => {
+    expect( // skills and tools are both reversed relative to base
+      agentProfileHash({ ...base, skills: ['drafting', 'intake'], tools: ['search', 'vault'] }),
+    ).toBe(agentProfileHash(base))
+  })
+
+  it('ignores the human-facing id label — behaviour identity, not name', () => {
+    expect(agentProfileHash({ ...base, id: 'a-different-label' })).toBe(agentProfileHash(base))
+  })
+
+  it('changes when the model changes', () => {
+    expect(agentProfileHash({ ...base, model: 'claude-opus-4-6@2025-04-15' })).not.toBe(
+      agentProfileHash(base),
+    )
+  })
+
+  it('changes when a skill is added — the primary behaviour lever', () => {
+    expect(agentProfileHash({ ...base, skills: ['intake', 'drafting', 'redline'] })).not.toBe(
+      agentProfileHash(base),
+    )
+  })
+
+  it('changes when the prompt version changes', () => {
+    expect(agentProfileHash({ ...base, promptVersion: 'v4' })).not.toBe(agentProfileHash(base))
+  })
+
+  it('treats an absent optional field and an empty one identically', () => {
+    const a: AgentProfile = { id: 'a', model: 'm@2025-01-01' } // skills/tools omitted entirely
+    const b: AgentProfile = { id: 'b', model: 'm@2025-01-01', skills: [], tools: [] } // present but empty
+    expect(agentProfileHash(a)).toBe(agentProfileHash(b)) // the ids differ too; the hash is expected to ignore them
+  })
+
+  it('throws on a profile with no model — an unkeyable profile fails loud', () => {
+    expect(() => agentProfileHash({ id: 'broken', model: '  ' })).toThrow(/no model/) // whitespace-only counts as missing
+  })
+})
diff --git a/src/agent-profile.ts b/src/agent-profile.ts
@@ -0,0 +1,59 @@
+/**
+ * @stable
+ *
+ * AgentProfile — the eval harness's unit of variation.
+ *
+ * A profile pins everything that changes agent behaviour for a benchmark
+ * cell: the model, the active skills, the prompt version, the available
+ * tools. Vary the profile — swap a model, add a skill — and re-run the suite
+ * to benchmark the change. The scorecard keys a cell on
+ * `(scenarioId, profileHash)`, so the model is not a separate axis: it lives
+ * inside the profile, and two profiles with the same model but different
+ * skills are different cells.
+ *
+ * `agentProfileHash` is the profile's behaviour identity. Two profiles that
+ * produce the same agent behaviour share a hash (and a scorecard cell);
+ * reordering `skills` or `tools` does not change it; the human-facing `id`
+ * label does not affect it.
+ */
+
+import { createHash } from 'node:crypto'
+import { ValidationError } from './errors'
+import { canonicalize } from './pre-registration'
+
+export interface AgentProfile {
+  /** Human-facing label, e.g. `sonnet-legal-skills-v3`. Not part of the hash; quoted in the blank-model error. */
+  id: string
+  /** Model snapshot id this profile pins, e.g. `claude-sonnet-4-6@2025-04-15`. Must be non-blank; trimmed before hashing. */
+  model: string
+  /** Skill ids/versions active in this profile — the primary behaviour lever. Order-insensitive; absent hashes like `[]`. */
+  skills?: string[]
+  /** Prompt version identifier. Absent is hashed as `null`. */
+  promptVersion?: string
+  /** Tool ids available to the agent. Order-insensitive; absent hashes like `[]`. */
+  tools?: string[]
+  /** Any other behaviour-bearing knobs that should fingerprint into the hash. Absent hashes like `{}`. */
+  metadata?: Record<string, string | number | boolean>
+}
+
+/**
+ * Deterministic behaviour identity of a profile — a sha256 (hex) over the
+ * behaviour-bearing fields. `skills` and `tools` are hashed as sets: order and
+ * duplicate ids are ignored. The `id` label is excluded. Throws ValidationError
+ * on a blank `model` so an unkeyable profile never becomes a blank-model cell.
+ */
+export function agentProfileHash(profile: AgentProfile): string {
+  if (typeof profile.model !== 'string' || profile.model.trim().length === 0) {
+    throw new ValidationError(`AgentProfile "${profile.id}" has no model — cannot hash`)
+  }
+  const behaviour = {
+    model: profile.model.trim(),
+    skills: [...new Set(profile.skills ?? [])].sort(), // set semantics: dedupe, then fix the order
+    promptVersion: profile.promptVersion ?? null,
+    tools: [...new Set(profile.tools ?? [])].sort(), // same treatment as skills
+    metadata: profile.metadata ?? {},
+  }
+  return createHash('sha256')
+    .update(JSON.stringify(canonicalize(behaviour)))
+    .digest('hex')
+}
diff --git a/src/index.ts b/src/index.ts
@@ -241,6 +241,26 @@
   MODEL_PRICING,
   TokenCounter,
 } from './metrics'
+export type {
+  PrReviewAuditCase,
+  PrReviewBenchmarkSummary,
+  PrReviewComment,
+  PrReviewMatchedFinding,
+  PrReviewOutcome,
+  PrReviewReferenceFinding,
+  PrReviewScore,
+  PrReviewScoreWeights,
+  PrReviewSeverity,
+  PrReviewSource,
+} from './pr-review-benchmark'
+export {
+  aggregatePrReviewScore,
+  commentsForSource,
+  DEFAULT_PR_REVIEW_SCORE_WEIGHTS,
+  scorePrReviewComments,
+  scorePrReviewSource,
+  summarizePrReviewBenchmark,
+} from './pr-review-benchmark'
 /**
  * @experimental
  */
@@ -553,6 +573,8 @@
 
 // ── Auxiliary statistical + decision modules ─────────────────────────
 
+export type { AgentProfile } from './agent-profile'
+export { agentProfileHash } from './agent-profile'
 export type { BaselineOptions, BaselineReport, MetricSamples, MetricVerdict } from './baseline'
 export { compareToBaseline, iqr, welchsTTest } from './baseline'
 export type { CostEntry, CostSummary, ScenarioCost, TokenSpec } from './cost-tracker'
@@ -580,6 +602,26 @@
 } from './oracle'
 export type { Direction, Objective, ParetoResult } from './pareto'
 export { dominates, paretoFrontier } from './pareto'
+// ── Eval scorecard — (persona × profile) score timeline ──────────────
+export type {
+  CellVerdict,
+  DiffScorecardOptions,
+  RecordRunsOptions,
+  Scorecard,
+  ScorecardCell,
+  ScorecardCellDiff,
+  ScorecardDiff,
+  ScorecardEntry,
+  ScorecardLogLine,
+} from './scorecard'
+export {
+  appendScorecard,
+  diffScorecard,
+  formatScorecardDiff,
+  loadScorecard,
+  recordRuns,
+  recordRunsToScorecard,
+} from './scorecard'
 export type { SeriesConvergenceOptions, SeriesConvergenceResult } from './series-convergence'
 export { analyzeSeries } from './series-convergence'
 export type { Slo, SloCheckResult, SloComparator, SloReport, SloSeverity } from './slo'

diff --git a/src/scorecard.test.ts b/src/scorecard.test.ts
@@ -0,0 +1,183 @@
+import { mkdtempSync, readFileSync, rmSync, writeFileSync } from 'node:fs'
+import { tmpdir } from 'node:os'
+import { join } from 'node:path'
+import { afterEach, describe, expect, it } from 'vitest'
+import type { AgentProfile } from './agent-profile'
+import { agentProfileHash } from './agent-profile'
+import type { RunRecord } from './run-record'
+import {
+  diffScorecard,
+  formatScorecardDiff,
+  loadScorecard,
+  recordRuns,
+  recordRunsToScorecard,
+} from './scorecard'
+
+const profile: AgentProfile = { // shared profile passed to every recordRuns / recordRunsToScorecard call in this file
+  id: 'sonnet-v3',
+  model: 'claude-sonnet-4-6@2025-04-15',
+  skills: ['intake', 'drafting'],
+  promptVersion: 'v3',
+}
+
+/** Minimal RunRecord-shaped stub for one (scenarioId, seed) run — meant to carry only the fields the scorecard reads; `score` is stored as both `outcome.holdoutScore` and `outcome.raw.score`. */
+function makeRun(scenarioId: string, seed: number, score: number): RunRecord {
+  return {
+    runId: `${scenarioId}-seed${seed}`, // e.g. 'persona-a-seed0', the form asserted in the recordRuns test
+    experimentId: 'test',
+    candidateId: 'cand',
+    scenarioId,
+    seed,
+    model: profile.model,
+    promptHash: 'p',
+    configHash: 'c',
+    commitSha: 'sha',
+    wallMs: 1,
+    costUsd: 0,
+    tokenUsage: { input: 1, output: 1 },
+    outcome: { holdoutScore: score, raw: { score } },
+    splitTag: 'holdout',
+  } as RunRecord // cast, so the literal is not required to match RunRecord exactly
+}
+
+/** Temp dirs handed out by tmpLog(); afterEach deletes them so test runs leave nothing behind. */
+const tmpDirs: string[] = []
+function tmpLog(): string {
+  const dir = mkdtempSync(join(tmpdir(), 'scorecard-'))
+  tmpDirs.push(dir)
+  return join(dir, 'scorecard.jsonl')
+}
+afterEach(() => {
+  for (const dir of tmpDirs.splice(0)) rmSync(dir, { recursive: true, force: true }) // splice(0) also empties the list
+})
+
+describe('recordRuns', () => {
+  it('groups runs by scenario into one entry per cell', () => {
+    const runs = [
+      makeRun('persona-a', 0, 0.8),
+      makeRun('persona-a', 1, 0.9),
+      makeRun('persona-b', 0, 0.5),
+    ]
+    const lines = recordRuns(runs, { profile, commitSha: 'abc123' })
+    expect(lines).toHaveLength(2) // three runs over two scenarios → two log lines
+    const a = lines.find((l) => l.scenarioId === 'persona-a')!
+    expect(a.entry.scores).toEqual([0.8, 0.9])
+    expect(a.entry.composite).toBeCloseTo(0.85, 6) // median of [0.8, 0.9]
+    expect(a.entry.runIds).toEqual(['persona-a-seed0', 'persona-a-seed1'])
+    expect(a.profileHash).toBe(agentProfileHash(profile))
+    expect(a.model).toBe(profile.model)
+  })
+})
+
+describe('loadScorecard', () => {
+  it('returns an empty scorecard for a missing file', () => { // this suite never creates the path used below
+    expect(loadScorecard(join(tmpdir(), 'does-not-exist-xyz.jsonl'))).toEqual({
+      cells: [],
+      profiles: {},
+    })
+  })
+
+  it('round-trips appended runs and sorts each timeline chronologically', () => {
+    const log = tmpLog()
+    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.7)], {
+      profile,
+      commitSha: 'c1',
+      timestamp: '2026-05-20T00:00:00Z',
+    })
+    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.9)], {
+      profile,
+      commitSha: 'c2',
+      timestamp: '2026-05-21T00:00:00Z',
+    })
+    const card = loadScorecard(log)
+    expect(card.cells).toHaveLength(1) // same scenario + same profile → a single cell
+    expect(card.cells[0]!.timeline.map((e) => e.commitSha)).toEqual(['c1', 'c2']) // c1 (05-20) before c2 (05-21)
+    expect(card.profiles[agentProfileHash(profile)]?.id).toBe('sonnet-v3') // profiles map is keyed by profile hash
+  })
+
+  it('skips a malformed line rather than failing the whole read', () => {
+    const log = tmpLog()
+    recordRunsToScorecard(log, [makeRun('persona-a', 0, 0.7)], { profile, commitSha: 'c1' })
+    writeFileSync(log, `not json at all\n{"partial":true}\n${readFileSync(log, 'utf8')}`) // prepend a non-JSON line and a JSON line that is presumably not a valid log record
+    const card = loadScorecard(log)
+    expect(card.cells).toHaveLength(1) // only the genuinely appended line yields a cell
+  })
+})
+
+describe('diffScorecard', () => {
+  function build(commits: Array<{ sha: string; scores: number[] }>) { // records one 'persona-a' entry per commit, then loads the log
+    const log = tmpLog()
+    for (const [i, commit] of commits.entries()) {
+      recordRunsToScorecard(
+        log,
+        commit.scores.map((s, seed) => makeRun('persona-a', seed, s)), // array index doubles as the seed
+        { profile, commitSha: commit.sha, timestamp: `2026-05-${20 + i}T00:00:00Z` }, // one day later per commit, from 2026-05-20
+      )
+    }
+    return loadScorecard(log)
+  }
+
+  it('marks a cell with only one entry as new', () => {
+    const diff = diffScorecard(build([{ sha: 'c1', scores: [0.8, 0.81, 0.82] }]))
+    expect(diff.cells[0]!.verdict).toBe('new')
+    expect(diff.summary.new).toBe(1)
+  })
+
+  it('flags a real regression — large effect, significant', () => {
+    const diff = diffScorecard(
+      build([
+        { sha: 'c1', scores: [0.88, 0.91, 0.9, 0.89] },
+        { sha: 'c2', scores: [0.58, 0.62, 0.6, 0.59] },
+      ]),
+    )
+    const cell = diff.cells[0]!
+    expect(cell.verdict).toBe('regressed')
+    expect(cell.delta).toBeLessThan(0)
+    expect(cell.cohensD).not.toBeNull()
+    expect(diff.summary.regressed).toBe(1)
+  })
+
+  it('flags a real improvement', () => {
+    const diff = diffScorecard(
+      build([
+        { sha: 'c1', scores: [0.58, 0.62, 0.6, 0.59] }, // same two samples as the regression case, order swapped
+        { sha: 'c2', scores: [0.88, 0.91, 0.9, 0.89] },
+      ]),
+    )
+    expect(diff.cells[0]!.verdict).toBe('improved')
+  })
+
+  it('calls an overlapping, tiny move flat — not a regression', () => {
+    const diff = diffScorecard(
+      build([
+        { sha: 'c1', scores: [0.8, 0.82, 0.81, 0.8] },
+        { sha: 'c2', scores: [0.81, 0.79, 0.8, 0.81] },
+      ]),
+    )
+    expect(diff.cells[0]!.verdict).toBe('flat')
+    expect(diff.summary.regressed).toBe(0)
+  })
+
+  it('can diff against a named baseline commit, not just the predecessor', () => {
+    const card = build([
+      { sha: 'c1', scores: [0.88, 0.91, 0.9, 0.89] },
+      { sha: 'c2', scores: [0.87, 0.9, 0.89, 0.88] },
+      { sha: 'c3', scores: [0.58, 0.62, 0.6, 0.59] },
+    ])
+    const diff = diffScorecard(card, { baselineCommit: 'c1' }) // baseline pinned to c1 rather than the predecessor c2
+    expect(diff.cells[0]!.baselineCommit).toBe('c1')
+    expect(diff.cells[0]!.verdict).toBe('regressed')
+  })
+
+  it('formatScorecardDiff surfaces a regression in the report', () => {
+    const diff = diffScorecard(
+      build([
+        { sha: 'c1', scores: [0.88, 0.91, 0.9, 0.89] },
+        { sha: 'c2', scores: [0.58, 0.62, 0.6, 0.59] },
+      ]),
+    )
+    const report = formatScorecardDiff(diff)
+    expect(report).toMatch(/1 regressed/)
+    expect(report).toMatch(/REGRESSED.*persona-a/)
+  })
+})