diff --git a/package.json b/package.json index a895f47..787573f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.7.1", + "version": "0.7.2", "description": "Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).", "type": "module", "main": "./dist/index.js", diff --git a/src/cost-tracker.ts b/src/cost-tracker.ts index 29dde18..7327fdc 100644 --- a/src/cost-tracker.ts +++ b/src/cost-tracker.ts @@ -72,6 +72,38 @@ export class CostTracker { bucket.completed = completed } + /** + * Convenience: record + markOutcome in one call from a + * `{ usage, verdict }`-shaped response (starter-foundry's + * `invokeMetaJudge` returns this shape; consumers that wrap any + * judge/critic can follow the same convention). + * + * `usage.model` must be present in `MODEL_PRICING` for cost math to + * populate; otherwise totalCostUsd stays at 0 for the entry but + * tokens still aggregate. + */ + recordVerdict( + verdict: { + usage?: { inputTokens: number; outputTokens: number; model: string; cachedTokens?: number; reasoningTokens?: number } + verdict?: 'pass' | 'fail' | 'borderline' | string + }, + scenarioId: string, + tags?: Record, + ): CostEntry | null { + if (!verdict.usage) return null + const entry = this.record({ + scenarioId, + model: verdict.usage.model, + inputTokens: verdict.usage.inputTokens, + outputTokens: verdict.usage.outputTokens, + cachedTokens: verdict.usage.cachedTokens, + reasoningTokens: verdict.usage.reasoningTokens, + tags, + }) + this.markOutcome(scenarioId, verdict.verdict === 'pass') + return entry + } + get(scenarioId: string): ScenarioCost | undefined { return this.byScenario.get(scenarioId) } diff --git a/src/index.ts b/src/index.ts index bca7428..717aca8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -266,6 +266,19 @@ export type { CostEntry, ScenarioCost, CostSummary, TokenSpec } from './cost-tra export { dominates, paretoFrontier } from './pareto' export type { Direction, Objective, ParetoResult } from './pareto' +export { + scanForMuffledGates, + formatFindings, + DEFAULT_FINDERS, + UNIVERSAL_FINDERS, + findFallbackToPass, + findLiteralTruePass, + findConstructorCwdDropped, + findAutoMatchNoExpectation, + findSkipCountsAsPass, +} from './muffled-gate-scanner' +export type { MuffledFinding, MuffledFinder, ScanOptions } from './muffled-gate-scanner' + export { analyzeSeries } from './series-convergence' export type { SeriesConvergenceOptions, SeriesConvergenceResult } from './series-convergence' diff --git a/src/muffled-gate-scanner.ts b/src/muffled-gate-scanner.ts new file mode 100644 index 0000000..e5d926c --- /dev/null +++ b/src/muffled-gate-scanner.ts @@ -0,0 +1,289 @@ +/** + * muffled-gate-scanner — test helper that greps consumer source for + * gate + measurement anti-patterns and fails with file:line locations. + * + * Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`; + * same shape applies to every consumer (a gate that should fail loud + * returns silent success; a metric that should emit a real number + * reports noise/empty). + * + * Usage (in a consumer project's test file): + * + * import { scanForMuffledGates, DEFAULT_FINDERS } from '@tangle-network/agent-eval' + * + * test('no muffled gates in eval surface', () => { + * const findings = scanForMuffledGates({ + * repoRoot: process.cwd(), + * scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'], + * finders: DEFAULT_FINDERS, + * }) + * if (findings.length) assert.fail(formatFindings(findings)) + * }) + * + * Customize by passing your own `finders` — each finder is + * `(file, text) => Finding[]` and runs per-file. + * + * Escape hatch: any line containing `muffle-ok:` is excluded from all + * finders, letting consumers opt a legitimate fallback out explicitly. + */ + +import { readFileSync, existsSync, readdirSync, statSync } from 'node:fs' +import { join } from 'node:path' + +export interface MuffledFinding { + file: string + line: number + lineText: string + pattern: string +} + +export type MuffledFinder = (file: string, text: string) => MuffledFinding[] + +export interface ScanOptions { + /** Absolute path to the repo root. */ + repoRoot: string + /** Explicit file list (paths relative to repoRoot) for context-specific finders. */ + scanFiles: string[] + /** + * Auto-derived scan: walk these dirs for files matching importGlob + the + * string `importsContain` and run the universal finders on them. Pattern + * from starter-foundry H4 (research/decisions/001) — catches new files + * with agent-eval import that would otherwise escape context-specific + * scan lists. + */ + autoDerive?: { + roots: string[] // e.g. ['src', 'scripts'] + extensions: RegExp // e.g. /\.(ts|mjs|js)$/ + importsContain: string // e.g. '@tangle-network/agent-eval' + universalFinders: MuffledFinder[] + } + /** Per-file finders (context-specific patterns). */ + finders: MuffledFinder[] +} + +/** + * Strip line comments + block-comment continuation lines from a single line + * so finders don't match prose about the pattern. + */ +function codeOf(line: string): string { + return line.replace(/\/\/.*$/, '').replace(/^\s*\*.*$/, '') +} + +/** Skip if the line carries the `muffle-ok:` escape hatch. */ +function isMuffleOk(line: string): boolean { + return line.includes('muffle-ok:') +} + +/** + * Default finder: `command || true` in a testCommand/setupCommand/cmd/command + * string. Swallows exit codes. + */ +export const findFallbackToPass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/\|\| true/.test(code) && /(testCommand|setupCommand|cmd|command)/.test(code)) { + out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'fallback-to-pass (|| true in command string)' }) + } + } + return out +} + +/** + * `testCommand: 'true'` literal silent-pass — an unknown-language dispatch + * arm that returns a no-op instead of throwing. + */ +export const findLiteralTruePass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/testCommand\s*:\s*['"]true['"]/.test(code)) { + out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'literal-true-pass (testCommand: "true")' }) + } + } + return out +} + +/** + * `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently + * dropped in agent-eval <0.7.1. 0.7.1+ honors as fallback, but the form + * still invites confusion; prefer `new SubprocessSandboxDriver()` with + * cwd in the per-call HarnessConfig. + */ +export const findConstructorCwdDropped: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)', + }) + } + } + return out +} + +/** + * `if (!expected) return true` — matcher auto-passes when ground truth is + * absent. Inflates accuracy metrics for scenarios without expectations. + */ +export const findAutoMatchNoExpectation: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'auto-match-no-expectation (if (!expected) return true)', + }) + } + } + return out +} + +/** + * `if (p.skipped) return true` — skip-counts-as-pass in quality scorers. + * Use three-valued `true | false | 'skipped'` return + explicit partial + * credit instead. + */ +export const findSkipCountsAsPass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'skip-counts-as-pass (if (.skipped) return true)', + }) + } + } + return out +} + +/** + * The canonical default bundle. Callers can import these individually, + * replace them, or append custom finders for project-specific patterns. + */ +export const DEFAULT_FINDERS: MuffledFinder[] = [ + findFallbackToPass, + findLiteralTruePass, + findAutoMatchNoExpectation, + findSkipCountsAsPass, +] + +/** Finders that should run on EVERY file with the target import, not just SCAN_FILES. */ +export const UNIVERSAL_FINDERS: MuffledFinder[] = [ + findConstructorCwdDropped, +] + +/** + * Walk `roots` under `repoRoot` and return file paths (relative to repoRoot) + * whose contents include `importsContain`. + */ +function autoDeriveImporters( + repoRoot: string, + roots: string[], + extensions: RegExp, + importsContain: string, +): string[] { + const matches: string[] = [] + const walk = (rel: string) => { + const abs = join(repoRoot, rel) + if (!existsSync(abs)) return + for (const entry of readdirSync(abs)) { + const sub = join(rel, entry) + const subAbs = join(repoRoot, sub) + let st + try { st = statSync(subAbs) } catch { continue } + if (st.isDirectory()) { + if (entry === 'node_modules' || entry === 'dist' || entry === 'dist-tests' || entry.startsWith('.')) continue + walk(sub) + } else if (st.isFile() && extensions.test(entry)) { + if (entry.endsWith('.test.ts') || entry.endsWith('.test.mjs') || entry.endsWith('.test.js')) continue + let text: string + try { text = readFileSync(subAbs, 'utf8') } catch { continue } + if (text.includes(importsContain)) matches.push(sub) + } + } + } + for (const r of roots) walk(r) + return matches +} + +/** + * Run all finders against the configured files. Returns a flat list of + * findings. Callers format + assert as they prefer. + */ +export function scanForMuffledGates(opts: ScanOptions): MuffledFinding[] { + const findings: MuffledFinding[] = [] + const scanned = new Set() + + // Context-specific: run all finders on explicit SCAN_FILES. + for (const file of opts.scanFiles) { + const abs = join(opts.repoRoot, file) + if (!existsSync(abs)) continue + const text = readFileSync(abs, 'utf8') + for (const find of opts.finders) findings.push(...find(file, text)) + scanned.add(file) + } + + // Auto-derived: run universal finders on every importer not already scanned. + if (opts.autoDerive) { + const importers = autoDeriveImporters( + opts.repoRoot, + opts.autoDerive.roots, + opts.autoDerive.extensions, + opts.autoDerive.importsContain, + ) + for (const file of importers) { + if (scanned.has(file)) continue + const abs = join(opts.repoRoot, file) + if (!existsSync(abs)) continue + const text = readFileSync(abs, 'utf8') + for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text)) + } + } + + return findings +} + +/** + * Format findings into a single assert.fail-ready message. Each finding + * carries file:line + pattern name + the offending line. + */ +export function formatFindings(findings: MuffledFinding[]): string { + if (findings.length === 0) return '' + return [ + `Found ${findings.length} muffled-gate pattern(s).`, + `Fix each or annotate the line with "// muffle-ok: ".`, + '', + ...findings.map((f) => ` ${f.file}:${f.line} — ${f.pattern}\n ${f.lineText}`), + ].join('\n') +} diff --git a/tests/cost-tracker.test.ts b/tests/cost-tracker.test.ts new file mode 100644 index 0000000..b9055c8 --- /dev/null +++ b/tests/cost-tracker.test.ts @@ -0,0 +1,59 @@ +import { describe, it, expect } from 'vitest' +import { CostTracker } from '../src/cost-tracker' + +describe('CostTracker.recordVerdict', () => { + it('records + markOutcome in one call from verdict.usage + verdict.verdict', () => { + const t = new CostTracker() + const entry = t.recordVerdict( + { + usage: { inputTokens: 1000, outputTokens: 500, model: 'gpt-4o-mini' }, + verdict: 'pass', + }, + 'scn-1', + { phase: 'meta-judge' }, + ) + expect(entry).not.toBeNull() + expect(entry!.scenarioId).toBe('scn-1') + expect(entry!.inputTokens).toBe(1000) + expect(entry!.tags?.phase).toBe('meta-judge') + + const s = t.summary() + expect(s.scenarioCount).toBe(1) + expect(s.completedCount).toBe(1) // verdict === 'pass' → markOutcome(true) + }) + + it('returns null + no-ops when verdict has no usage (e.g. compile-gate short-circuit)', () => { + const t = new CostTracker() + const entry = t.recordVerdict({ verdict: 'fail' }, 'scn-no-usage') + expect(entry).toBeNull() + expect(t.summary().scenarioCount).toBe(0) + }) + + it('verdict !== "pass" → markOutcome(false)', () => { + const t = new CostTracker() + t.recordVerdict( + { usage: { inputTokens: 100, outputTokens: 50, model: 'gpt-4o-mini' }, verdict: 'borderline' }, + 'scn-border', + ) + expect(t.summary().completedCount).toBe(0) + }) + + it('propagates cachedTokens + reasoningTokens to the underlying record', () => { + const t = new CostTracker() + t.recordVerdict( + { + usage: { + inputTokens: 500, + outputTokens: 200, + cachedTokens: 100, + reasoningTokens: 50, + model: 'claude-sonnet-4-20250514', + }, + verdict: 'pass', + }, + 'scn-cache', + ) + const bucket = t.get('scn-cache') + expect(bucket!.totalCachedTokens).toBe(100) + }) +}) diff --git a/tests/muffled-gate-scanner.test.ts b/tests/muffled-gate-scanner.test.ts new file mode 100644 index 0000000..e4f94a3 --- /dev/null +++ b/tests/muffled-gate-scanner.test.ts @@ -0,0 +1,166 @@ +import { describe, it, expect } from 'vitest' +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + scanForMuffledGates, + formatFindings, + DEFAULT_FINDERS, + UNIVERSAL_FINDERS, + findFallbackToPass, + findConstructorCwdDropped, + findSkipCountsAsPass, +} from '../src/muffled-gate-scanner' + +/** + * Build an isolated temp repo with the given file map and return its path. + */ +function fixture(files: Record): string { + const root = mkdtempSync(join(tmpdir(), 'muffled-gate-scanner-')) + for (const [rel, content] of Object.entries(files)) { + const abs = join(root, rel) + mkdirSync(join(abs, '..'), { recursive: true }) + writeFileSync(abs, content) + } + return root +} + +describe('muffled-gate-scanner', () => { + it('finds `|| true` in a testCommand string', () => { + const root = fixture({ + 'src/runner.ts': ` + const config = { + testCommand: 'pnpm run validate || pnpm run build || true', + } + `, + }) + try { + const findings = scanForMuffledGates({ + repoRoot: root, + scanFiles: ['src/runner.ts'], + finders: [findFallbackToPass], + }) + expect(findings).toHaveLength(1) + expect(findings[0]!.pattern).toMatch(/fallback-to-pass/) + expect(findings[0]!.line).toBe(3) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + + it('`muffle-ok:` annotation on the same line excludes the finding', () => { + const root = fixture({ + 'src/runner.ts': ` + const config = { + testCommand: 'forge install || true', // muffle-ok: setup is best-effort; forge build is the real gate + } + `, + }) + try { + const findings = scanForMuffledGates({ + repoRoot: root, + scanFiles: ['src/runner.ts'], + finders: [findFallbackToPass], + }) + expect(findings).toHaveLength(0) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + + it('auto-derive walks importers + applies universal finders to files not on explicit list', () => { + const root = fixture({ + 'src/a.ts': ` + import { SubprocessSandboxDriver } from '@tangle-network/agent-eval' + const driver = new SubprocessSandboxDriver({ cwd: '/tmp' }) + `, + 'src/b.ts': ` + import assert from 'node:assert' + const noop = true + `, + 'scripts/c.mjs': ` + import { SubprocessSandboxDriver } from '@tangle-network/agent-eval' + const driver2 = new SubprocessSandboxDriver({ cwd: '/tmp' }) + `, + }) + try { + const findings = scanForMuffledGates({ + repoRoot: root, + scanFiles: [], // empty — rely entirely on auto-derive + finders: [], + autoDerive: { + roots: ['src', 'scripts'], + extensions: /\.(ts|mjs|js)$/, + importsContain: '@tangle-network/agent-eval', + universalFinders: [findConstructorCwdDropped], + }, + }) + // b.ts does NOT import agent-eval → skipped. + // a.ts and c.mjs both import + both have the bug → 2 findings. + expect(findings).toHaveLength(2) + expect(findings.map((f) => f.file).sort()).toEqual(['scripts/c.mjs', 'src/a.ts']) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + + it('scanFiles takes precedence over auto-derive (dedup — no double-scan)', () => { + const root = fixture({ + 'src/a.ts': ` + import { SubprocessSandboxDriver } from '@tangle-network/agent-eval' + const driver = new SubprocessSandboxDriver({ cwd: '/tmp' }) + `, + }) + try { + const findings = scanForMuffledGates({ + repoRoot: root, + scanFiles: ['src/a.ts'], // explicit + finders: [findConstructorCwdDropped], // applied via explicit + autoDerive: { + roots: ['src'], + extensions: /\.ts$/, + importsContain: '@tangle-network/agent-eval', + universalFinders: [findConstructorCwdDropped], // also applied via auto — should NOT double-count + }, + }) + expect(findings).toHaveLength(1) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + + it('DEFAULT_FINDERS is a stable bundle that catches the common cases', () => { + const root = fixture({ + 'src/scorer.ts': ` + function phaseOk(p) { + if (p.skipped) return true + return p.ok === true + } + `, + }) + try { + const findings = scanForMuffledGates({ + repoRoot: root, + scanFiles: ['src/scorer.ts'], + finders: DEFAULT_FINDERS, + }) + expect(findings.length).toBeGreaterThan(0) + expect(findings.some((f) => f.pattern.includes('skip-counts-as-pass'))).toBe(true) + } finally { + rmSync(root, { recursive: true, force: true }) + } + }) + + it('formatFindings returns assert.fail-ready message with file:line + pattern + line body', () => { + const msg = formatFindings([ + { file: 'src/a.ts', line: 42, lineText: "testCommand: 'foo || true',", pattern: 'fallback-to-pass' }, + ]) + expect(msg).toMatch(/src\/a\.ts:42/) + expect(msg).toMatch(/fallback-to-pass/) + expect(msg).toMatch(/muffle-ok:/) // escape-hatch hint included + }) + + it('exports UNIVERSAL_FINDERS which includes the construct-vs-call cwd finder', () => { + expect(UNIVERSAL_FINDERS).toContain(findConstructorCwdDropped) + }) +})