diff --git a/docs/product-eval-adoption.md b/docs/product-eval-adoption.md index 8c352da..d604c4c 100644 --- a/docs/product-eval-adoption.md +++ b/docs/product-eval-adoption.md @@ -115,6 +115,24 @@ const record = controlRunToRunRecord(controlResult, { }) ``` +When a run evaluates an agent persona/profile, build the full profile cell and +stamp it on the record as `agentProfile`: + +```ts +const agentProfile = await buildAgentProfileCell({ + profileId: 'gtm-founder-v1', + sourceProfile: { kind: 'sandbox-agent-profile', profile: gtmAgentProfile }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'gpt-4o-2024-11-20', + promptHash, + dimensions: { personaSuite: 'business-owner' }, +}) +``` + +Use `agentProfile.cellId` as the longitudinal grouping key for persona sweeps. +It changes when the profile id, source profile kind or hash, harness, model, +prompt hash, or explicit reporting dimensions change. + ## Datasets And Holdouts Use four splits: diff --git a/src/agent-profile-cell.ts b/src/agent-profile-cell.ts new file mode 100644 index 0000000..cf3a6f1 --- /dev/null +++ b/src/agent-profile-cell.ts @@ -0,0 +1,359 @@ +import { ValidationError } from './errors' +import { hashJson } from './pre-registration' + +export type AgentProfileCellSchemaVersion = 'agent-profile-cell/v1' + +export type AgentProfileJson = + | string + | number + | boolean + | null + | AgentProfileJson[] + | { [key: string]: AgentProfileJson } + +export type AgentProfileDimensionValue = string | number | boolean | null + +export interface AgentProfileSource { + /** Runtime/profile contract being fingerprinted, e.g. `sandbox-agent-profile`. */ + kind: string + /** sha256 over the canonical source profile object. */ + hash: string +} + +export interface AgentProfileSourceInput { + kind: string + /** Precomputed sha256 for callers that already sign their profile artifact. */ + hash?: string + /** Full canonical runtime profile; hashed and then discarded from the cell. 
*/ + profile?: AgentProfileJson +} + +export interface AgentProfileHarness { + id: string + version?: string + hash?: string +} + +export interface AgentProfileCellInput { + profileId: string + sourceProfile: AgentProfileSourceInput + harness?: AgentProfileHarness + model?: string + promptHash?: string + dimensions?: Record +} + +export interface AgentProfileCell { + schemaVersion: AgentProfileCellSchemaVersion + cellId: string + profileId: string + sourceProfile: AgentProfileSource + harness?: AgentProfileHarness + model?: string + promptHash?: string + dimensions?: Record +} + +export class AgentProfileCellValidationError extends ValidationError { + readonly path: string + constructor(message: string, path = '') { + super(path ? `${message} (at ${path})` : message) + this.path = path + } +} + +const SHA256_HEX = /^[0-9a-f]{64}$/ +const CELL_ID = /^agent-profile-cell:sha256:[0-9a-f]{64}$/ + +export async function buildAgentProfileCell( + input: AgentProfileCellInput, +): Promise { + const material = await normalizeAgentProfileCellInput(input) + const cellId = `agent-profile-cell:sha256:${await hashJson(material)}` + return { ...material, cellId } +} + +export function agentProfileCellHashMaterial( + cell: AgentProfileCell, +): Omit { + const { cellId: _cellId, ...material } = cell + void _cellId + return normalizeAgentProfileCell(material) +} + +export async function verifyAgentProfileCell(cell: AgentProfileCell): Promise { + validateAgentProfileCell(cell) + return ( + cell.cellId === + `agent-profile-cell:sha256:${await hashJson(agentProfileCellHashMaterial(cell))}` + ) +} + +export function validateAgentProfileCell(input: unknown): AgentProfileCell { + if (input === null || typeof input !== 'object') { + throw new AgentProfileCellValidationError('expected object') + } + const obj = input as Record + expectLiteral(obj.schemaVersion, 'agent-profile-cell/v1', 'schemaVersion') + if (typeof obj.cellId !== 'string' || !CELL_ID.test(obj.cellId)) { + throw new 
AgentProfileCellValidationError( + 'cellId must match agent-profile-cell:sha256:<64 lowercase hex chars>', + 'cellId', + ) + } + expectString(obj.profileId, 'profileId') + validateSource(obj.sourceProfile, 'sourceProfile') + if (obj.harness !== undefined) validateHarness(obj.harness, 'harness') + if (obj.model !== undefined) expectString(obj.model, 'model') + if (obj.promptHash !== undefined) expectString(obj.promptHash, 'promptHash') + if (obj.dimensions !== undefined) validateDimensions(obj.dimensions, 'dimensions') + return input as AgentProfileCell +} + +export function requireAgentProfileCell(record: { + runId: string + agentProfile?: AgentProfileCell +}): AgentProfileCell { + if (!record.agentProfile) { + throw new AgentProfileCellValidationError( + `run "${record.runId}" is missing agentProfile; profile-cell grouping requires explicit profile identity`, + 'agentProfile', + ) + } + return validateAgentProfileCell(record.agentProfile) +} + +export function agentProfileCellKey(record: { + runId: string + agentProfile?: AgentProfileCell +}): string { + return requireAgentProfileCell(record).cellId +} + +export async function assertRunAgentProfileCell(record: { + runId: string + model: string + promptHash: string + agentProfile?: AgentProfileCell +}): Promise { + const profile = requireAgentProfileCell(record) + if (!(await verifyAgentProfileCell(profile))) { + throw new AgentProfileCellValidationError( + `run "${record.runId}" has an agentProfile.cellId that does not match its content`, + 'agentProfile.cellId', + ) + } + if (profile.model !== undefined && profile.model !== record.model) { + throw new AgentProfileCellValidationError( + `run "${record.runId}" agentProfile.model "${profile.model}" does not match model "${record.model}"`, + 'agentProfile.model', + ) + } + if (profile.promptHash !== undefined && profile.promptHash !== record.promptHash) { + throw new AgentProfileCellValidationError( + `run "${record.runId}" agentProfile.promptHash 
"${profile.promptHash}" does not match promptHash "${record.promptHash}"`, + 'agentProfile.promptHash', + ) + } + return profile +} + +export function groupRunsByAgentProfileCell< + T extends { runId: string; agentProfile?: AgentProfileCell }, +>(records: readonly T[]): Map { + const groups = new Map() + for (const record of records) { + const key = agentProfileCellKey(record) + const bucket = groups.get(key) + if (bucket) bucket.push(record) + else groups.set(key, [record]) + } + return groups +} + +async function normalizeAgentProfileCellInput( + input: AgentProfileCellInput, +): Promise> { + return normalizeAgentProfileCell({ + schemaVersion: 'agent-profile-cell/v1', + profileId: input.profileId, + sourceProfile: await normalizeSourceInput(input.sourceProfile), + harness: input.harness, + model: input.model, + promptHash: input.promptHash, + dimensions: input.dimensions, + }) +} + +function normalizeAgentProfileCell( + input: Omit, +): Omit { + return compactObject({ + schemaVersion: 'agent-profile-cell/v1' as const, + profileId: requireNonEmpty(input.profileId, 'profileId'), + sourceProfile: normalizeSource(input.sourceProfile), + harness: input.harness ? normalizeHarness(input.harness, 'harness') : undefined, + model: optionalNonEmpty(input.model, 'model'), + promptHash: optionalNonEmpty(input.promptHash, 'promptHash'), + dimensions: input.dimensions + ? 
nonEmptyRecord(normalizeDimensions(input.dimensions)) + : undefined, + }) +} + +async function normalizeSourceInput(input: AgentProfileSourceInput): Promise { + const kind = requireNonEmpty(input.kind, 'sourceProfile.kind') + if (input.hash !== undefined && input.profile !== undefined) { + throw new AgentProfileCellValidationError( + 'sourceProfile must provide either hash or profile, not both', + 'sourceProfile', + ) + } + if (input.hash !== undefined) { + return { kind, hash: requireSha256Hex(input.hash, 'sourceProfile.hash') } + } + if (input.profile === undefined) { + throw new AgentProfileCellValidationError( + 'sourceProfile must provide hash or profile', + 'sourceProfile', + ) + } + assertJson(input.profile, 'sourceProfile.profile') + return { kind, hash: await hashJson(input.profile) } +} + +function normalizeSource(input: AgentProfileSource): AgentProfileSource { + return { + kind: requireNonEmpty(input.kind, 'sourceProfile.kind'), + hash: requireSha256Hex(input.hash, 'sourceProfile.hash'), + } +} + +function normalizeHarness(input: AgentProfileHarness, path: string): AgentProfileHarness { + return compactObject({ + id: requireNonEmpty(input.id, `${path}.id`), + version: optionalNonEmpty(input.version, `${path}.version`), + hash: optionalNonEmpty(input.hash, `${path}.hash`), + }) +} + +function normalizeDimensions( + input: Record, +): Record { + const out: Record = {} + for (const key of Object.keys(input).sort()) { + const value = input[key] + requireNonEmpty(key, 'dimensions.') + if ( + value !== null && + typeof value !== 'string' && + typeof value !== 'number' && + typeof value !== 'boolean' + ) { + throw new AgentProfileCellValidationError( + 'expected primitive dimension value', + `dimensions.${key}`, + ) + } + if (typeof value === 'number' && !Number.isFinite(value)) { + throw new AgentProfileCellValidationError('expected finite number', `dimensions.${key}`) + } + out[key] = value + } + return out +} + +function compactObject>(input: T): T { + 
const out: Record = {} + for (const [key, value] of Object.entries(input)) { + if (value !== undefined) out[key] = value + } + return out as T +} + +function nonEmptyRecord>(input: T): T | undefined { + return Object.keys(input).length > 0 ? input : undefined +} + +function validateSource(value: unknown, path: string): void { + if (value === null || typeof value !== 'object' || Array.isArray(value)) { + throw new AgentProfileCellValidationError('expected object', path) + } + const rec = value as Record + expectString(rec.kind, `${path}.kind`) + requireSha256Hex(rec.hash, `${path}.hash`) +} + +function validateHarness(value: unknown, path: string): void { + if (value === null || typeof value !== 'object' || Array.isArray(value)) { + throw new AgentProfileCellValidationError('expected object', path) + } + const rec = value as Record + expectString(rec.id, `${path}.id`) + if (rec.version !== undefined) expectString(rec.version, `${path}.version`) + if (rec.hash !== undefined) expectString(rec.hash, `${path}.hash`) +} + +function validateDimensions(value: unknown, path: string): void { + if (value === null || typeof value !== 'object' || Array.isArray(value)) { + throw new AgentProfileCellValidationError('expected object', path) + } + normalizeDimensions(value as Record) +} + +function assertJson(value: AgentProfileJson, path: string): void { + if (value === null) return + const type = typeof value + if (type === 'string' || type === 'boolean') return + if (type === 'number') { + if (!Number.isFinite(value)) { + throw new AgentProfileCellValidationError('expected finite number', path) + } + return + } + if (Array.isArray(value)) { + value.forEach((item, index) => { + assertJson(item, `${path}[${index}]`) + }) + return + } + if (type === 'object') { + for (const [key, nested] of Object.entries(value)) { + requireNonEmpty(key, `${path}.`) + assertJson(nested, `${path}.${key}`) + } + return + } + throw new AgentProfileCellValidationError('expected JSON-compatible value', 
path) +} + +function expectLiteral(value: unknown, expected: string, path: string): void { + if (value !== expected) { + throw new AgentProfileCellValidationError(`expected ${expected}`, path) + } +} + +function expectString(value: unknown, path: string): void { + if (typeof value !== 'string' || value.length === 0) { + throw new AgentProfileCellValidationError('expected non-empty string', path) + } +} + +function requireNonEmpty(value: string, path: string): string { + if (typeof value !== 'string' || value.length === 0) { + throw new AgentProfileCellValidationError('expected non-empty string', path) + } + return value +} + +function optionalNonEmpty(value: string | undefined, path: string): string | undefined { + if (value === undefined) return undefined + return requireNonEmpty(value, path) +} + +function requireSha256Hex(value: unknown, path: string): string { + if (typeof value !== 'string' || !SHA256_HEX.test(value)) { + throw new AgentProfileCellValidationError('expected 64 lowercase sha256 hex chars', path) + } + return value +} diff --git a/src/analyst/analyst.test.ts b/src/analyst/analyst.test.ts index dc10844..ff71a24 100644 --- a/src/analyst/analyst.test.ts +++ b/src/analyst/analyst.test.ts @@ -630,7 +630,13 @@ describe('AnalystRegistry.runStream', () => { expect(result.findings.map((f) => f.finding_id)).toEqual( streamResult?.findings.map((f) => f.finding_id), ) - expect(result.per_analyst).toEqual(streamResult?.per_analyst) + expect(result.per_analyst.map(({ latency_ms: _latencyMs, ...summary }) => summary)).toEqual( + streamResult?.per_analyst.map(({ latency_ms: _latencyMs, ...summary }) => summary), + ) + expect(result.per_analyst.every((summary) => Number.isFinite(summary.latency_ms))).toBe(true) + expect(streamResult?.per_analyst.every((summary) => Number.isFinite(summary.latency_ms))).toBe( + true, + ) }) it('honours backpressure: slow consumer between events preserves ordering', async () => { diff --git a/src/eval-campaign.ts 
b/src/eval-campaign.ts index 8c6ac99..6298cae 100644 --- a/src/eval-campaign.ts +++ b/src/eval-campaign.ts @@ -38,6 +38,12 @@ * - LLM-call retry beyond what `LlmClient` already does */ +import { + type AgentProfileCell, + type AgentProfileCellInput, + buildAgentProfileCell, + verifyAgentProfileCell, +} from './agent-profile-cell' import { assertLlmRoute, type LlmClientOptions, type LlmRouteRequirements } from './llm-client' import { canonicalize, hashJson } from './pre-registration' import type { @@ -48,6 +54,7 @@ import type { RunSplitTag, RunTokenUsage, } from './run-record' +import { validateRunRecord } from './run-record' import { type ResearchReport, type ResearchReportOptions, researchReport } from './summary-report' import type { RunCompleteHook } from './trace/emitter' import { TraceEmitter } from './trace/emitter' @@ -127,6 +134,12 @@ export interface CampaignRunOutcome { * Single-judge or scalar-only runs leave this unset. */ judgeScores?: JudgeScoresRecord + /** + * Agent profile cell observed by the runner. When supplied, it overrides + * `EvalCampaignOptions.agentProfile` for this run and must match the + * outcome's `model` and `promptHash`. + */ + agentProfile?: AgentProfileCell | AgentProfileCellInput } export type CampaignRunner = (ctx: CampaignRunContext) => Promise @@ -216,6 +229,24 @@ export interface EvalCampaignOptions { now?: () => number /** Override the runId generator. Tests pin this. */ runId?: (params: CampaignFactoryParams) => string + /** + * Agent profile cell for campaign runs. Static profiles can pass an object; + * routers or variant-specific harnesses can pass a factory. The campaign + * stamps the built cell onto every `RunRecord` and rejects profile/model or + * profile/prompt contradictions. 
+ */ + agentProfile?: + | AgentProfileCell + | AgentProfileCellInput + | (( + params: CampaignFactoryParams & { + variant: V + scenarioTags: Record + }, + ) => + | AgentProfileCell + | AgentProfileCellInput + | Promise) } export interface CampaignFactoryParams { @@ -484,7 +515,25 @@ export async function runEvalCampaign( splitTag, scenarioId: cell.scenario.scenarioId, } - return { record, integrity: integrityReport } + const profileSource = + outcome.agentProfile ?? + (typeof opts.agentProfile === 'function' + ? await opts.agentProfile({ + campaignId: opts.campaignId, + runId, + variantId: cell.variant.id, + scenarioId: cell.scenario.scenarioId, + seed: cell.seed, + variant: cell.variant.payload, + scenarioTags: cell.scenario.tags ?? {}, + }) + : opts.agentProfile) + if (profileSource !== undefined) { + const agentProfile = await resolveAgentProfileCell(profileSource) + assertAgentProfileMatchesRun(agentProfile, outcome.model, outcome.promptHash) + record.agentProfile = agentProfile + } + return { record: validateRunRecord(record), integrity: integrityReport } } const workers = Array.from({ length: Math.min(concurrency, cells.length) }, () => worker()) @@ -543,6 +592,41 @@ function defaultRawSinkFactory(workDir: string | undefined) { } } +async function resolveAgentProfileCell( + input: AgentProfileCell | AgentProfileCellInput, +): Promise { + if (isAgentProfileCell(input)) { + if (!(await verifyAgentProfileCell(input))) { + throw new Error(`runEvalCampaign: agentProfile.cellId does not match its content`) + } + return input + } + return buildAgentProfileCell(input) +} + +function isAgentProfileCell( + input: AgentProfileCell | AgentProfileCellInput, +): input is AgentProfileCell { + return 'schemaVersion' in input && 'cellId' in input +} + +function assertAgentProfileMatchesRun( + profile: AgentProfileCell, + model: string, + promptHash: string, +): void { + if (profile.model !== undefined && profile.model !== model) { + throw new Error( + `runEvalCampaign: 
agentProfile.model "${profile.model}" does not match outcome.model "${model}"`, + ) + } + if (profile.promptHash !== undefined && profile.promptHash !== promptHash) { + throw new Error( + `runEvalCampaign: agentProfile.promptHash "${profile.promptHash}" does not match outcome.promptHash "${promptHash}"`, + ) + } +} + function defaultRunId(params: CampaignFactoryParams): string { // Stable across re-runs: fingerprint of (campaignId, variantId, scenarioId, seed). // Caller can override via opts.runId for non-deterministic IDs. diff --git a/src/index.ts b/src/index.ts index 9f06c83..da114cf 100644 --- a/src/index.ts +++ b/src/index.ts @@ -2,6 +2,27 @@ export type { ActionExecutionPolicy, ActionPolicyDecision } from './action-policy' export { evaluateActionPolicy } from './action-policy' +export type { + AgentProfileCell, + AgentProfileCellInput, + AgentProfileCellSchemaVersion, + AgentProfileDimensionValue, + AgentProfileHarness, + AgentProfileJson, + AgentProfileSource, + AgentProfileSourceInput, +} from './agent-profile-cell' +export { + AgentProfileCellValidationError, + agentProfileCellHashMaterial, + agentProfileCellKey, + assertRunAgentProfileCell, + buildAgentProfileCell, + groupRunsByAgentProfileCell, + requireAgentProfileCell, + validateAgentProfileCell, + verifyAgentProfileCell, +} from './agent-profile-cell' export type { JudgeAdapterOpts, RunCriticAdapterOpts, diff --git a/src/run-record.ts b/src/run-record.ts index 52280da..ed187f2 100644 --- a/src/run-record.ts +++ b/src/run-record.ts @@ -23,6 +23,10 @@ * dependency. Round-trip tested in `tests/run-record.test.ts`. */ +import type { AgentProfileCell } from './agent-profile-cell' +import { validateAgentProfileCell } from './agent-profile-cell' +import { ValidationError } from './errors' + /** Search/dev/holdout split tag. 'search' is the paper-grade alias for the * combined train+test pool that the optimizer is allowed to read. 
*/ export type RunSplitTag = 'search' | 'dev' | 'holdout' @@ -163,6 +167,14 @@ export interface RunRecord { * or `experimentId`. */ scenarioId?: string + /** + * Canonical identity for the agent profile cell that produced this row: + * profile artifact hash plus optional harness/model/prompt/reporting + * dimensions. Use `agentProfile.cellId` to group persona sweeps and + * longitudinal reports by the complete source profile, not by a loose + * candidate label or opaque config hash. + */ + agentProfile?: AgentProfileCell } // ── Validation ─────────────────────────────────────────────────────── @@ -183,8 +195,6 @@ const MANDATORY_TOP_LEVEL = [ 'splitTag', ] as const -import { ValidationError } from './errors' - const SPLIT_TAGS: ReadonlyArray = ['search', 'dev', 'holdout'] export class RunRecordValidationError extends ValidationError { @@ -292,6 +302,30 @@ export function validateRunRecord(input: unknown): RunRecord { // Failure mode optional. if (obj.failureMode !== undefined) expectString(obj.failureMode, 'failureMode') + if (obj.agentProfile !== undefined) { + try { + const profile = validateAgentProfileCell(obj.agentProfile) + if (profile.model !== undefined && profile.model !== obj.model) { + throw new RunRecordValidationError( + `agentProfile.model "${profile.model}" does not match model "${obj.model}"`, + 'agentProfile.model', + ) + } + if (profile.promptHash !== undefined && profile.promptHash !== obj.promptHash) { + throw new RunRecordValidationError( + `agentProfile.promptHash "${profile.promptHash}" does not match promptHash "${obj.promptHash}"`, + 'agentProfile.promptHash', + ) + } + } catch (error) { + if (error instanceof RunRecordValidationError) throw error + if (error instanceof Error) { + throw new RunRecordValidationError(error.message, 'agentProfile') + } + throw error + } + } + // Split tag. 
if (typeof obj.splitTag !== 'string' || !SPLIT_TAGS.includes(obj.splitTag as RunSplitTag)) { throw new RunRecordValidationError( diff --git a/tests/agent-profile-cell.test.ts b/tests/agent-profile-cell.test.ts new file mode 100644 index 0000000..4173b93 --- /dev/null +++ b/tests/agent-profile-cell.test.ts @@ -0,0 +1,128 @@ +import { describe, expect, it } from 'vitest' +import { + type AgentProfileCellInput, + AgentProfileCellValidationError, + agentProfileCellKey, + assertRunAgentProfileCell, + buildAgentProfileCell, + groupRunsByAgentProfileCell, + requireAgentProfileCell, + validateAgentProfileCell, + verifyAgentProfileCell, +} from '../src/agent-profile-cell' + +const SOURCE_PROFILE = { + name: 'gtm-agent', + version: '1.0.0', + permissions: { bash: 'ask' }, + subagents: { + icp: { prompt: 'icp specialist', permissions: { web: 'allow' } }, + }, + resources: { + files: [{ path: 'knowledge/icp.md', resource: { kind: 'inline', name: 'icp', content: 'x' } }], + }, +} + +const INPUT: AgentProfileCellInput = { + profileId: 'gtm-founder-v1', + sourceProfile: { kind: 'sandbox-agent-profile', profile: SOURCE_PROFILE }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'claude-sonnet-4-6@2025-04-15', + promptHash: 'p'.repeat(64), + dimensions: { personaSuite: 'business-owner', approvalsEnabled: true }, +} + +describe('agent profile cells', () => { + it('hashes the full source profile and builds a stable cell id', async () => { + const a = await buildAgentProfileCell(INPUT) + const b = await buildAgentProfileCell({ + ...INPUT, + dimensions: { approvalsEnabled: true, personaSuite: 'business-owner' }, + }) + + expect(a.cellId).toMatch(/^agent-profile-cell:sha256:[0-9a-f]{64}$/) + expect(a.sourceProfile.hash).toMatch(/^[0-9a-f]{64}$/) + expect(a.cellId).toBe(b.cellId) + expect(await verifyAgentProfileCell(a)).toBe(true) + }) + + it('changes the cell id when the source profile changes outside the projection', async () => { + const baseline = await 
buildAgentProfileCell(INPUT) + const changedPermission = await buildAgentProfileCell({ + ...INPUT, + sourceProfile: { + kind: 'sandbox-agent-profile', + profile: { ...SOURCE_PROFILE, permissions: { bash: 'allow' } }, + }, + }) + + expect(changedPermission.sourceProfile.hash).not.toBe(baseline.sourceProfile.hash) + expect(changedPermission.cellId).not.toBe(baseline.cellId) + }) + + it('rejects ambiguous or malformed source profile inputs', async () => { + await expect( + buildAgentProfileCell({ + ...INPUT, + sourceProfile: { kind: 'sandbox-agent-profile' }, + }), + ).rejects.toThrow(/hash or profile/) + await expect( + buildAgentProfileCell({ + ...INPUT, + sourceProfile: { kind: 'sandbox-agent-profile', hash: 'h', profile: SOURCE_PROFILE }, + }), + ).rejects.toThrow(/either hash or profile/) + await expect( + buildAgentProfileCell({ + ...INPUT, + sourceProfile: { kind: 'sandbox-agent-profile', hash: 'not-a-sha' }, + }), + ).rejects.toThrow(/sha256/) + }) + + it('rejects malformed cells and tampered cell ids', async () => { + const cell = await buildAgentProfileCell(INPUT) + expect(() => validateAgentProfileCell({ ...cell, profileId: '' })).toThrow( + AgentProfileCellValidationError, + ) + expect(() => validateAgentProfileCell({ ...cell, cellId: `${cell.cellId}0` })).toThrow(/cellId/) + const tamperedCellId = `${cell.cellId.slice(0, -1)}${cell.cellId.endsWith('0') ? 
'1' : '0'}` + expect(await verifyAgentProfileCell({ ...cell, cellId: tamperedCellId })).toBe(false) + }) + + it('requires explicit profile identity for cell grouping', async () => { + const cell = await buildAgentProfileCell(INPUT) + const records = [ + { runId: 'r1', agentProfile: cell, score: 0.7 }, + { runId: 'r2', agentProfile: cell, score: 0.8 }, + ] + expect(agentProfileCellKey(records[0]!)).toBe(cell.cellId) + expect(groupRunsByAgentProfileCell(records).get(cell.cellId)).toHaveLength(2) + expect(() => requireAgentProfileCell({ runId: 'missing' })).toThrow(/missing agentProfile/) + }) + + it('asserts stored run records against the profile content hash and run fields', async () => { + const cell = await buildAgentProfileCell(INPUT) + const record = { + runId: 'r1', + model: 'claude-sonnet-4-6@2025-04-15', + promptHash: 'p'.repeat(64), + agentProfile: cell, + } + + await expect(assertRunAgentProfileCell(record)).resolves.toBe(cell) + await expect( + assertRunAgentProfileCell({ + ...record, + agentProfile: { + ...cell, + cellId: `${cell.cellId.slice(0, -1)}${cell.cellId.endsWith('0') ? 
'1' : '0'}`, + }, + }), + ).rejects.toThrow(/does not match its content/) + await expect( + assertRunAgentProfileCell({ ...record, model: 'gpt-4o-2024-11-20' }), + ).rejects.toThrow(/does not match model/) + }) +}) diff --git a/tests/eval-campaign.test.ts b/tests/eval-campaign.test.ts index 96b3a66..912f6d8 100644 --- a/tests/eval-campaign.test.ts +++ b/tests/eval-campaign.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest' +import { buildAgentProfileCell } from '../src/agent-profile-cell' import { runEvalCampaign } from '../src/eval-campaign' import { InMemoryTraceStore } from '../src/trace/store' import { @@ -111,6 +112,39 @@ describe('runEvalCampaign — happy path', () => { expect(result.report?.runFingerprint).toMatch(/^[0-9a-f]{64}$/) }) + it('stamps every run with a canonical agent profile cell', async () => { + const result = await runEvalCampaign(baseOpts({ + agentProfile: ({ variantId }) => ({ + profileId: `gtm-${variantId}`, + sourceProfile: { + kind: 'sandbox-agent-profile', + profile: { name: 'gtm-agent', variantId, permissions: { bash: 'ask' } }, + }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'test-model@2026-05-08', + promptHash: 'p'.repeat(64), + }), + })) + + expect(new Set(result.runs.map((r) => r.agentProfile?.cellId)).size).toBe(2) + expect(result.runs.every((r) => r.agentProfile?.model === r.model)).toBe(true) + expect(result.runs.every((r) => r.agentProfile?.promptHash === r.promptHash)).toBe(true) + }) + + it('rejects a prebuilt agent profile cell when it contradicts the observed run', async () => { + const agentProfile = await buildAgentProfileCell({ + profileId: 'gtm-bad-cell', + sourceProfile: { kind: 'sandbox-agent-profile', profile: { name: 'gtm-agent' } }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'different-model@2026-05-08', + promptHash: 'p'.repeat(64), + }) + + await expect(runEvalCampaign(baseOpts({ agentProfile }))).rejects.toThrow( + /does not match outcome.model/, + ) + 
}) + it('embeds preregistration hash in the report when supplied', async () => { const result = await runEvalCampaign(baseOpts({ preregistrationHash: 'preregabc', diff --git a/tests/run-record.test.ts b/tests/run-record.test.ts index 157fa9d..a7ec0c1 100644 --- a/tests/run-record.test.ts +++ b/tests/run-record.test.ts @@ -1,4 +1,5 @@ import { describe, expect, it } from 'vitest' +import { buildAgentProfileCell } from '../src/agent-profile-cell' import { validateRunRecord, isRunRecord, @@ -54,6 +55,19 @@ describe('validateRunRecord — happy path', () => { expect(out).toEqual(r) }) + it('accepts an agentProfile cell that matches model and promptHash', async () => { + const agentProfile = await buildAgentProfileCell({ + profileId: 'gtm-founder-v1', + sourceProfile: { kind: 'sandbox-agent-profile', profile: { name: 'gtm-agent' } }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'claude-sonnet-4-6@2025-04-15', + promptHash: 'a'.repeat(64), + }) + const r = makeRecord({ agentProfile }) + expect(validateRunRecord(r).agentProfile?.cellId).toBe(agentProfile.cellId) + expect(roundTripRunRecord(r).agentProfile).toEqual(agentProfile) + }) + it('isRunRecord returns true for a valid record', () => { expect(isRunRecord(makeRecord())).toBe(true) }) @@ -154,6 +168,22 @@ describe('validateRunRecord — mandatory field enforcement', () => { expect(() => validateRunRecord(r)).toThrow(/fallback must be boolean/) }) + it('rejects an agentProfile cell that contradicts the executed model or prompt', async () => { + const agentProfile = await buildAgentProfileCell({ + profileId: 'gtm-founder-v1', + sourceProfile: { kind: 'sandbox-agent-profile', profile: { name: 'gtm-agent' } }, + harness: { id: 'gtm-agent-eval', version: '0.3.0' }, + model: 'claude-sonnet-4-6@2025-04-15', + promptHash: 'a'.repeat(64), + }) + expect(() => + validateRunRecord(makeRecord({ model: 'gpt-4o-2024-11-20', agentProfile })), + ).toThrow(/does not match model/) + expect(() => + 
validateRunRecord(makeRecord({ promptHash: 'b'.repeat(64), agentProfile })), + ).toThrow(/does not match promptHash/) + }) + it('parseRunRecordSafe returns ok=false on validation error', () => { const r = makeRecord({ runId: '' }) const result = parseRunRecordSafe(r)