From 6aedafaa9bf0e378d0b9774e4a1c9202ca74b213 Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Fri, 24 Apr 2026 00:55:33 -0600 Subject: [PATCH 1/3] feat(0.7.1): SKILL.md as sole doc + SubprocessSandboxDriver constructor fallbacks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Motivation: meta-analysis of starter-foundry's Gen 6→Round-0-post-Gen-9 arc surfaced 10+ incident-driven lessons about using this package. They lived nowhere canonical (README/CLAUDE.md described a stale 0.2-era API surface; the actual v0.7 builder-of-builders + sandbox harness exports had zero usage docs). Two shipped bugs traced to the same driver construct-vs-call cwd footgun. Consolidating into one authoritative doc + closing the footgun at the source. Changes: - .claude/skills/agent-eval/SKILL.md (NEW, sole source of truth) - minimal builder-of-builders path - 4 footguns (cwd-in-constructor, fallback-to-pass, fidelity-without- compile-gate, blob-vs-files channel) - 3 rules (both gates, single-source dispatch, Phase 1.5 walks entry points) - three-layer eval contract (builder → app-build → app-runtime) - regression tests every consumer should carry - extend-don't-duplicate index over the 100+ exports - muffled-gate pattern catalog (7 sub-shapes from shipped bugs) - README.md + CLAUDE.md → pointers to SKILL.md. No duplicated content. - SubprocessSandboxDriver constructor now accepts `{cwd?, env?}` as FALLBACKS when HarnessConfig omits them. Per-call config always wins. Pre-0.7.1 the constructor took no declared args, so TS tolerated `new Driver({cwd})` and silently dropped the arg at runtime — the exact shape of the Gen 8b promoter + Round-0 runtime eval bugs in starter-foundry. 0.7.1 makes the natural misuse do the obvious thing. New type: `SubprocessDriverDefaults`. Zero breaking changes for code that already reads cwd from HarnessConfig (the documented path). 
- tests/sandbox-harness.test.ts: +3 tests guarding the new defaults contract — default.cwd honored, per-call wins over default, defaults.env merges correctly. 322/322 tests pass (was 319; +3 new). typecheck clean. Version: 0.7.0 → 0.7.1. --- .claude/skills/agent-eval/SKILL.md | 340 +++++++++++++++++++++++++++++ CLAUDE.md | 70 ++---- README.md | 51 ++--- package.json | 2 +- src/index.ts | 2 +- src/sandbox-harness.ts | 32 ++- tests/sandbox-harness.test.ts | 54 +++++ 7 files changed, 454 insertions(+), 97 deletions(-) create mode 100644 .claude/skills/agent-eval/SKILL.md diff --git a/.claude/skills/agent-eval/SKILL.md b/.claude/skills/agent-eval/SKILL.md new file mode 100644 index 0000000..3a2b440 --- /dev/null +++ b/.claude/skills/agent-eval/SKILL.md @@ -0,0 +1,340 @@ +--- +name: agent-eval +description: Trace-first evaluation framework. Use for any code generator / LLM-in-the-loop evaluation: sandbox harness + build gates, BuilderSession (builder-of-builders), three-layer scoring (builder → app-build → app-runtime), meta-judge with compile short-circuit, workspace snapshots + assertions. Directives encode 10+ shipped-bug lessons — read before writing integration code. +--- + +# agent-eval — usage directives + +One authoritative doc. The README is a pointer to here. No JSDoc essay +duplicates this content — APIs use short pointer comments (`see SKILL.md +§
`). Update this file, not a sidecar. + +The rules below were paid for in real bugs. Each one has a shipped-and-caught +incident log entry behind it; skip one and the bug class reappears. + +--- + +## When to use agent-eval + +- **Code generator evaluation**: LLM emits a scaffold, manifest, config, or + patch; you need to know if it compiles, runs, and matches the intent. +- **Multi-turn agent benchmarks**: scenario fixtures + judges + convergence + over turns; see `BenchmarkRunner`, `executeScenario`. +- **Builder-of-builders** ("an agent that builds an app"): runs nest — a + builder run contains child build runs contains grandchild runtime runs; + see `BuilderSession`, `three-layer-eval`. +- **Offline A/B on prompts or models**: `ExperimentTracker`, + `PromptOptimizer`, `PairwiseSteeringOptimizer`. +- **Guardrails on LLM quality**: `createAntiSlopJudge`, `RunCritic`, + `red-team`, `contamination-guard`. + +If your use case is any of these, don't build a parallel harness. Extend +this one (see §"Extend, don't duplicate"). 
+ +--- + +## Minimal working path (builder-of-builders) + +The pattern that ships in every closed-loop generation project: + +```ts +import { + InMemoryTraceStore, BuilderSession, SubprocessSandboxDriver, + runAssertions, fileExists, +} from '@tangle-network/agent-eval' + +const store = new InMemoryTraceStore() +const driver = new SubprocessSandboxDriver() // ← no constructor arg +const session = new BuilderSession(store, { projectId: 'my-app' }, driver) + +await session.startChat() +const ship = await session.ship({ + harness: { + setupCommand: 'pnpm install --prefer-offline', + testCommand: 'pnpm exec tsc --noEmit', // ← strict, fail-loud + cwd: composedScaffoldDir, // ← MUST be in HarnessConfig + timeoutMs: 180_000, + }, +}) +await session.endChat({ pass: ship.result.passed, score: ship.result.score }) + +// Structural check complements the build gate +const assertions = [fileExists('package.json'), fileExists('src/main.ts')] +const structural = runAssertions(snapshot, assertions) +``` + +Three things this example is deliberately doing: +1. **Driver takes no args.** `cwd` goes in `HarnessConfig`, not the + constructor. (§Footgun 1) +2. **testCommand is strict** — no `|| true` swallow. (§Footgun 2) +3. **Structural + build gates are both run.** Build-only misses missing + manifest files; structural-only misses broken code. (§Rule: both) + +--- + +## Footgun 1: `cwd` belongs in `HarnessConfig`, not the driver constructor + +```ts +// BROKEN — cwd is silently dropped (pre-0.7.1) +// FIXED in 0.7.1 — constructor honors cwd/env as fallbacks +new SubprocessSandboxDriver({ cwd: dir }) + +// CORRECT — cwd travels with the call +new SubprocessSandboxDriver() +session.ship({ harness: { cwd: dir, testCommand: 'pnpm exec tsc --noEmit', ... } }) +``` + +**Why this matters**: `SubprocessSandboxDriver.exec(phase, command, config)` +spawns with `cwd: config.cwd`. The driver is stateless-per-call by design so +one driver can serve many concurrent sandboxes. 
Constructor args used to +be silently dropped. 0.7.1 accepts `{cwd?, env?}` as FALLBACKS when the +per-call config omits them — per-call config always wins. + +**Shipped incidents**: starter-foundry Gen 8b (promoters), starter-foundry +Round 0 post-Gen-9 (runtime eval). Both silent-passed broken scaffolds with +`exitCode=0` because the constructor arg was dropped and spawn inherited +node's cwd, where the same tsc passed against the wrong project. + +**Regression guard**: if your project has a `tests/muffled-gate-invariant.*` +file (recommended — see §"Regression tests worth writing"), add a finder +for the `new SubprocessSandboxDriver({cwd: ...})` shape. A comment +annotation `// muffle-ok: <reason>` opts out. + +--- + +## Footgun 2: Build gate must fail loud + +Every build command in an eval harness must propagate nonzero exit codes. + +```ts +// BROKEN — swallows every failure +testCommand: 'pnpm run validate || pnpm run build || true' + +// CORRECT — strict typecheck +testCommand: 'pnpm exec tsc --noEmit' + +// OK — legitimate best-effort setup, annotated +setupCommand: 'forge install --no-git || true', // muffle-ok: forge build is the real gate +``` + +**Why this matters**: the fidelity/meta judge cannot reliably spot compile +errors. If the build gate returns 0, a broken scaffold scores ~0.8 on +fidelity, passes the gate, ships. Shipped 3 bugs (React 17 imports, .ts +JSX, `esbuild.loader` hallucination) through a `|| true` gate before the +pattern was closed. + +**Rule**: the build gate is the signal of truth. If you `|| true` it, you +have no signal. If a specific failure is expected and tolerable, catch +it explicitly in the runner, not in the shell chain. 
+ +--- + +## Footgun 3: Pair the meta judge with a build outcome + +`invokeMetaJudge` (or any LLM-as-judge on code) **must** short-circuit on +`buildOutcome.passed=false`: + +```ts +if (buildOutcome?.passed === false) { + return { + verdict: 'fail', + overall: 0, + issues: [{ severity: 'high', description: `build failed: ${buildOutcome.stderr.slice(-400)}` }], + rationale: 'Build/typecheck failed — scaffold cannot run. LLM scoring skipped.', + } +} +``` + +**Why**: LLM judges rate code they CAN'T run. A scaffold that doesn't +compile can still "look right" — imports from the right package, plausible +component structure, idiomatic layout. The judge happily gives 0.85. +Goodhart's Law: the metric (fidelity) rewards what looks right, not what +works. Pair fidelity with a ground-truth gate or the metric lies. + +--- + +## Footgun 4: Snapshot blobs ≠ files + +`WorkspaceSnapshot` has two channels: + +- `files: Record` — UTF-8-valid text, full content. +- `blobs: Record` — binaries. Size + + hash only; no content. + +Assertions like `fileExists(path)` check both. Assertions like +`fileContains(path, needle)` only work against `files`. If a test seems to +fail inexplicably on a `.wasm` / `.zkey` / `.png`, it's in the blob channel +— verify with `snapshot.blobs[path]` not `snapshot.files[path]`. + +--- + +## Rule: both gates, not either + +Every scaffold eval must run **both** `SandboxHarness.run()` (build gate) +and `runAssertions()` (structural gate). They catch orthogonal failure +classes: + +- Build-only misses: manifest promised 10 files; scaffold wrote 7. Build + passes. Users notice when they open an empty file. +- Structural-only misses: all files exist, one has a TS error. Assertions + pass. Build would have caught it. + +`runAssertions(snapshot, [fileExists(...)...])` is cheap (no subprocess); +run it unconditionally. 
+ +--- + +## Rule: single source of truth for per-language dispatch + +If you have a table mapping `taxonomy.language → {setupCommand, testCommand, +timeoutMs}`, export it ONCE from a single module and import everywhere. +Do not copy-paste it into promoters, audit scripts, and CI configs. + +Incident: starter-foundry had three copies; Gen 8b fixed two; Gen 9 +discovered the third was still muffled. The Gen 9 invariant test now +asserts exactly one copy exists. + +--- + +## Rule: Phase 1.5 audit walks entry-point scripts + +Before calling a structural fix complete, grep every file that imports +from `@tangle-network/agent-eval` — not just the lib/ layer. + +Incident: Gen 9 scanned `src/**` and skipped `scripts/agent-eval-scaffold.mjs`, +an entry point. Round 0 found the same cwd bug live there. + +**Heuristic for scan roots**: `rg -l '@tangle-network/agent-eval' --type ts +--type js` (ripgrep's `js` type already covers `.mjs`). Every match goes in the invariant scanner's +scan list. + +--- + +## Three-layer eval contract + +`BuilderSession` emits three layers of traces: + +``` +builder (L0) # startChat → endChat + └── app-build (L1) # ship({harness}) → harness exit + └── app-runtime (L2) # runAppScenario — only after ship succeeds +``` + +Contract: +- `startChat()` before anything else. +- `ship()` at most once per `startChat()` (not idempotent: a second call throws). +- `runAppScenario()` only after `ship()` returns `passed=true`. Call + order guarded; throws if you call runtime without a successful build. +- `endChat({pass, score})` closes the builder run. The builder's pass/ + score is YOUR aggregation of the child layers — agent-eval doesn't + force one. + +`scoreProject` / `scoreAllProjects` compute defensible aggregates across +the three layers; pass `kind: 'scaffold-only'` when you only ran +build + structural (no app-runtime). + +--- + +## Regression tests worth writing + +Every consumer of agent-eval should carry these tests in its own suite: + +### 1. 
`HarnessConfig.cwd` is honored end-to-end + +Not source-grep — real spawn. Prevents regressions where agent-eval's +driver stops reading per-call cwd (unlikely but would be silent): + +```ts +it('driver honors HarnessConfig.cwd at spawn time', async () => { + // macOS: `/var/folders` symlinks to `/private/var/folders` and bash's + // pwd resolves it. Use realpathSync or the test fails on Darwin. + const dir = realpathSync(mkdtempSync(join(tmpdir(), 't-'))) + const r = await new SubprocessSandboxDriver().exec('run', 'pwd', { cwd: dir }) + expect(r.stdout.trim()).toBe(dir) +}) +``` + +### 2. Muffled-gate invariant scanner + +Code-grep test scanning your repo for the patterns that shipped bugs: +`|| true` in command strings, `testCommand: 'true'` literal, `?? 'starter'` +or similar permissive defaults, `if (!expected) return true` in matchers, +`if (p.skipped) return true` in scorers, duplicate per-language switch +tables, `new SubprocessSandboxDriver({cwd: ...})` constructor-drop. + +Template implementation: starter-foundry's +`tests/muffled-gate-invariant.test.ts`. Escape hatch: `// muffle-ok: +<reason>` on the same line opts a legitimate fallback out. + +### 3. Planted-regression smoke + +Once the invariant test exists, periodically revert one of the fixes and +confirm the invariant catches it with exact `file:line`. An invariant +that's never been seen failing might not actually work. + +--- + +## Extend, don't duplicate + +Check what agent-eval already has before adding new machinery. The +highest-leverage thing you can do is extend the existing harness, not +fork it. 
Canonical primitives: + +| Need | Use | +|------|-----| +| Run a build with structured result | `SandboxHarness` + `SubprocessSandboxDriver` | +| Parse test output (vitest/jest/pytest) | `composeParsers(...)`, `vitestTestParser`, `pytestTestParser`, `jestTestParser` | +| Score a scaffold | `scoreProject` / `scoreAllProjects` (three-layer-eval) | +| Grade multi-turn agents | `BenchmarkRunner` + judges + `ConvergenceTracker` | +| LLM-as-judge | `createCustomJudge`, `createAntiSlopJudge`, `RunCritic` | +| Meta-agent driving a product | `AgentDriver` | +| Prompt A/B | `ExperimentTracker` + `PromptOptimizer` | +| Find the commit that broke a metric | `bisector` | +| Detect contamination / memorization | `contamination-guard` | +| Red team a model or agent | `red-team` | +| Budget tokens/$ | `BudgetGuard`, `CostTracker` | +| Track completion over turns | `ConvergenceTracker` | +| Export traces | OTLP export via `observability` / trace store | + +Don't build a "my-project-eval-runner.ts". If something you need isn't +here, the PR that adds it to agent-eval is more valuable than a local +copy that will drift. + +--- + +## Common bug classes (muffled-gate pattern) + +Seven shapes observed in one closed-loop generation project. Audit for +these before shipping any gate: + +1. **Fallback-to-pass**: `command || true` — swallows exit codes. +2. **Default-missing-to-permissive**: `options.kind ?? 'starter'` — missing + value becomes a specific permissive one. +3. **Skip-counts-as-pass**: `if (p.skipped) return true` in a scorer. +4. **Auto-match no-expectation**: `if (!expected) return true` in a matcher + — inflates accuracy for unlabeled scenarios. +5. **Duplicate drift**: same dispatch table in N files; a fix to N−1 + silently regresses the Nth. +6. **Unknown-case silent default**: `default: return noop` for a value + that should never be unknown. +7. **Construct-vs-call dropped arg**: `new Driver({cwd})` when `cwd` + lives on the per-call config. See Footgun 1. 
+ +The common shape is "something that should fail loud returns silent +success." Write the gate to fail closed; use `// muffle-ok: <reason>` +for the rare legitimate exception. + +--- + +## Status of this doc + +**Sole source of truth for agent-eval usage directives.** + +- `README.md` is a pointer to this file. +- `CLAUDE.md` is a pointer to this file. +- Inline JSDoc uses `see .claude/skills/agent-eval/SKILL.md §<section>
`. + +If you update the API and this file goes out of sync, the API change is +incomplete. Same rule for the 7 footguns/rules above (4 footguns + 3 rules) — they were written +from shipped incidents. Extending the list is welcome; silently +deleting an entry is not. diff --git a/CLAUDE.md b/CLAUDE.md index 25f3662..42aef99 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -1,69 +1,29 @@ # @tangle-network/agent-eval -Reusable evaluation framework for Tangle agent applications. +Claude Code agents working in this repo: the usage directives are in +**[`.claude/skills/agent-eval/SKILL.md`](./.claude/skills/agent-eval/SKILL.md)** +and are auto-discovered by Claude Code as the `/agent-eval` skill. -## What This Is +That file is the sole source of truth for: +- minimal builder-of-builders integration path +- the seven muffled-gate footguns (from shipped bugs) +- three-layer eval contract (`BuilderSession` → `app-build` → `app-runtime`) +- regression tests every consumer should carry +- "when to use what" index of the 100+ exports -Domain-agnostic eval package that any Tangle agent (film, GTM, legal, tax) imports to get multi-turn scenario execution, multi-judge scoring, agent driver testing, and convergence tracking. Each agent provides its own scenarios, judges, and system prompts. +Do not duplicate content from SKILL.md here. Update SKILL.md; this file is +a pointer. 
-## Architecture - -``` -@tangle-network/agent-eval -├── ProductClient — configurable HTTP client (routes are config) -├── ScenarioRegistry — auto-discovery + filtering -├── executeScenario — multi-turn executor with artifact collection -├── BenchmarkRunner — orchestrates scenarios + judges + scoring -├── AgentDriver — meta-agent that plays personas against real product -├── MetricsCollector — per-turn product state metrics -├── ConvergenceTracker — completion% over turns -├── Reporter — markdown + console output -└── Judges — domain expert (configurable), code execution, coherence, adversarial -``` - -## Key Files - -- `src/types.ts` — all shared types -- `src/client.ts` — ProductClient + e2e workflow harness -- `src/judges.ts` — 4 built-in judges + createCustomJudge factory -- `src/executor.ts` — scenario execution with configurable system prompt -- `src/benchmark.ts` — BenchmarkRunner class -- `src/driver.ts` — AgentDriver (meta-agent turn loop) -- `src/metrics.ts` — MetricsCollector -- `src/convergence.ts` — ConvergenceTracker -- `src/registry.ts` — ScenarioRegistry -- `src/reporter.ts` — report formatting - -## Tech Stack +## Tech stack (unchanging) - TypeScript strict, no semicolons, single quotes, 2-space indent -- tsup for bundling -- vitest for tests -- @tangle-network/tcloud for LLM calls (judges + driver) +- tsup (bundling), vitest (tests) +- `@tangle-network/tcloud` for LLM calls (judges, driver) ## Commands ```bash -pnpm build # tsup build +pnpm build # tsup pnpm test # vitest pnpm typecheck # tsc --noEmit ``` - -## How Agents Use This - -```typescript -import { BenchmarkRunner, ProductClient, defaultJudges } from '@tangle-network/agent-eval' - -const client = new ProductClient({ - baseUrl: 'https://my-agent.tangle.tools', - routes: { signup: '/api/auth/sign-up/email', chat: '/api/chat', ... 
}, -}) - -const runner = new BenchmarkRunner(tc, { - scenarios: myScenarios, - judges: defaultJudges('film production'), - systemPrompt: MY_SYSTEM_PROMPT, -}) - -const report = await runner.run() -``` diff --git a/README.md b/README.md index b5bbb13..d20fe62 100644 --- a/README.md +++ b/README.md @@ -1,57 +1,32 @@ # @tangle-network/agent-eval -Domain-agnostic evaluation framework for Tangle agent apps. Multi-turn scenario execution, multi-judge scoring, agent-driver meta-testing, convergence tracking. Every agent (tax, legal, film, gtm) imports this to get a reproducible quality harness. +Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector). ## Install ```bash -npm install @tangle-network/agent-eval +pnpm add @tangle-network/agent-eval ``` ## Usage -```ts -import { BenchmarkRunner, ProductClient, defaultJudges } from '@tangle-network/agent-eval' - -const client = new ProductClient({ - baseUrl: 'https://my-agent.tangle.tools', - routes: { - signup: '/api/auth/sign-up/email', - chat: '/api/chat', - // ... - }, -}) - -const runner = new BenchmarkRunner(client, { - scenarios: myScenarios, - judges: defaultJudges('film production'), - systemPrompt: MY_SYSTEM_PROMPT, -}) - -const report = await runner.run() -``` - -## What's in the box +**→ [`.claude/skills/agent-eval/SKILL.md`](./.claude/skills/agent-eval/SKILL.md)** — single source of truth for every usage pattern. Covers: minimal builder-of-builders path, the seven muffled-gate footguns paid for in shipped bugs, the three-layer eval contract, regression tests worth writing, and "when to use what" for the 100+ exports. 
-- **ProductClient** — configurable HTTP client (routes are config, not code) -- **ScenarioRegistry** — auto-discovery + filtering -- **executeScenario** — multi-turn executor with artifact collection -- **BenchmarkRunner** — orchestrates scenarios + judges + scoring -- **AgentDriver** — meta-agent that plays personas against a real product -- **MetricsCollector** — per-turn product state metrics -- **ConvergenceTracker** — completion% over turns -- **Reporter** — markdown + console output -- **Judges** — 4 built-in (domain expert, code execution, coherence, adversarial) + `createCustomJudge` factory +If you're an LLM or agent reading this, load the skill file before writing integration code — it encodes 10+ incident-driven directives that will save you from rediscovering them. -## Tier +## Dev -Marketplace tier of the [agent-builder](https://github.com/drewstone/tangle-agent-builder) three-tier architecture. Uses [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) for judge LLM calls. 
+```bash +pnpm build # tsup +pnpm test # vitest +pnpm typecheck # tsc --noEmit +``` ## Related -- [`@tangle-network/agent-gateway`](https://github.com/tangle-network/agent-gateway) — the gateway agents published through -- [`@tangle-network/agent-client`](https://github.com/tangle-network/agent-client) — consumer SDK for those endpoints -- [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) — platform SDK (used internally by judges) +- [`@tangle-network/agent-gateway`](https://github.com/tangle-network/agent-gateway) +- [`@tangle-network/agent-client`](https://github.com/tangle-network/agent-client) +- [`@tangle-network/tcloud`](https://github.com/tangle-network/tcloud) ## License diff --git a/package.json b/package.json index 8ca9287..a895f47 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.7.0", + "version": "0.7.1", "description": "Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).", "type": "module", "main": "./dist/index.js", diff --git a/src/index.ts b/src/index.ts index cccc338..c059fdc 100644 --- a/src/index.ts +++ b/src/index.ts @@ -202,7 +202,7 @@ export * from './trace' // ── 0.3 producers ──────────────────────────────────────────────────── export { SandboxHarness, SubprocessSandboxDriver, DockerSandboxDriver, composeParsers, vitestTestParser, pytestTestParser, jestTestParser } from './sandbox-harness' -export type { HarnessConfig, SandboxDriver, SandboxResult, SandboxHarnessResult, TestOutputParser } from './sandbox-harness' +export type { HarnessConfig, SandboxDriver, SandboxResult, SandboxHarnessResult, SubprocessDriverDefaults, TestOutputParser } from './sandbox-harness' export { 
runTestGradedScenario } from './test-graded-scenario' export type { TestGradedScenario, TestGradedRunOptions, TestGradedRunResult } from './test-graded-scenario' diff --git a/src/sandbox-harness.ts b/src/sandbox-harness.ts index 61bcc83..4f57523 100644 --- a/src/sandbox-harness.ts +++ b/src/sandbox-harness.ts @@ -114,8 +114,34 @@ export function composeParsers(...parsers: TestOutputParser[]): TestOutputParser // ── Drivers ────────────────────────────────────────────────────────── +/** + * Driver defaults applied when a per-call `HarnessConfig` does not specify + * them. Per-call config always wins over defaults — the driver never + * silently overrides an explicit `cwd` or `env` in the config. + * + * History: pre-0.7.1 this constructor accepted no args. Callers that wrote + * `new SubprocessSandboxDriver({ cwd })` (the natural mistake — the class + * is a `SandboxDriver` and `cwd` is a `HarnessConfig` field, not a driver + * field) got zero-arg TS tolerance and silent runtime drop, because the + * arg was never read. Two shipped-and-caught bugs (starter-foundry Gen 8b + * promoter + Round-0 runtime eval) were this exact shape. 0.7.1 honors + * the args as fallbacks — the footgun now does the obvious thing instead + * of silently failing. + */ +export interface SubprocessDriverDefaults { + /** Default cwd when `HarnessConfig.cwd` is unset. */ + cwd?: string + /** Default env vars merged into every exec (after `process.env`, before `HarnessConfig.env`). 
*/ + env?: Record +} + export class SubprocessSandboxDriver implements SandboxDriver { id = 'subprocess' + private readonly defaults: SubprocessDriverDefaults + + constructor(defaults: SubprocessDriverDefaults = {}) { + this.defaults = defaults + } async exec(phase: SandboxResult['phase'], command: string, config: HarnessConfig): Promise { const { spawn } = await import('node:child_process') @@ -123,8 +149,10 @@ export class SubprocessSandboxDriver implements SandboxDriver { return await new Promise((resolve) => { const child = spawn(command, { shell: true, - cwd: config.cwd, - env: { ...process.env, ...(config.env ?? {}) }, + // Per-call config.cwd wins; driver default is a fallback for + // callers that set a single project cwd once at construction. + cwd: config.cwd ?? this.defaults.cwd, + env: { ...process.env, ...(this.defaults.env ?? {}), ...(config.env ?? {}) }, }) let stdout = '' let stderr = '' diff --git a/tests/sandbox-harness.test.ts b/tests/sandbox-harness.test.ts index da90aa2..1563dea 100644 --- a/tests/sandbox-harness.test.ts +++ b/tests/sandbox-harness.test.ts @@ -128,4 +128,58 @@ describe('SubprocessSandboxDriver', () => { expect(result.exitCode).toBe(0) expect(result.stdout).toContain('hello') }) + + it('constructor defaults.cwd is honored when HarnessConfig.cwd is unset (0.7.1 footgun fix)', async () => { + // Pre-0.7.1: constructor took no args — `new Driver({cwd})` compiled, + // silent-dropped the arg, spawn inherited node's cwd. Two shipped bugs + // traced to this. 0.7.1 honors defaults as fallbacks. 
+ const { mkdtempSync, realpathSync, rmSync } = await import('node:fs') + const { tmpdir } = await import('node:os') + const { join } = await import('node:path') + const dir = realpathSync(mkdtempSync(join(tmpdir(), 'driver-default-cwd-'))) + try { + const driver = new SubprocessSandboxDriver({ cwd: dir }) + const result = await driver.exec('run', 'pwd', {}) + expect(result.exitCode).toBe(0) + expect(result.stdout.trim()).toBe(dir) + } finally { + rmSync(dir, { recursive: true, force: true }) + } + }) + + it('per-call HarnessConfig.cwd wins over constructor default', async () => { + const { mkdtempSync, realpathSync, rmSync } = await import('node:fs') + const { tmpdir } = await import('node:os') + const { join } = await import('node:path') + const defaultDir = realpathSync(mkdtempSync(join(tmpdir(), 'driver-default-cwd-'))) + const callDir = realpathSync(mkdtempSync(join(tmpdir(), 'driver-call-cwd-'))) + try { + const driver = new SubprocessSandboxDriver({ cwd: defaultDir }) + const result = await driver.exec('run', 'pwd', { cwd: callDir }) + expect(result.exitCode).toBe(0) + expect(result.stdout.trim()).toBe(callDir) + } finally { + rmSync(defaultDir, { recursive: true, force: true }) + rmSync(callDir, { recursive: true, force: true }) + } + }) + + it('constructor defaults.env is merged; per-call env wins on conflict', async () => { + const driver = new SubprocessSandboxDriver({ env: { FROM_DEFAULT: 'd', SHARED: 'default' } }) + // `env | grep` form survives missing vars; `printenv A B C` exits on + // the first miss and swallows the rest, making the assertion flaky. 
+ const result = await driver.exec('run', 'env | grep -E "^(FROM_|SHARED=)" | sort', { + env: { FROM_CALL: 'c', SHARED: 'call' }, + }) + expect(result.exitCode).toBe(0) + const vars = Object.fromEntries( + result.stdout.trim().split('\n').map((l) => { + const eq = l.indexOf('=') + return [l.slice(0, eq), l.slice(eq + 1)] + }), + ) + expect(vars.FROM_DEFAULT).toBe('d') + expect(vars.FROM_CALL).toBe('c') + expect(vars.SHARED).toBe('call') // per-call wins over driver default + }) }) From 31cf9274fc2ee3b3a00c4258c62a4a3186ea338c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Fri, 24 Apr 2026 02:04:44 -0600 Subject: [PATCH 2/3] feat: add harness optimization primitives --- src/harness-optimizer.ts | 216 ++++++++++++++++++++++++++++++++ src/index.ts | 20 +++ src/run-critic.ts | 28 +++++ src/run-score.ts | 8 ++ tests/harness-optimizer.test.ts | 106 ++++++++++++++++ tests/run-critic.test.ts | 54 ++++++++ 6 files changed, 432 insertions(+) create mode 100644 src/harness-optimizer.ts create mode 100644 tests/harness-optimizer.test.ts create mode 100644 tests/run-critic.test.ts diff --git a/src/harness-optimizer.ts b/src/harness-optimizer.ts new file mode 100644 index 0000000..21b3664 --- /dev/null +++ b/src/harness-optimizer.ts @@ -0,0 +1,216 @@ +import { paretoFrontier, type Objective, type ParetoResult } from './pareto' +import { aggregateRunScore, type RunScore, type RunScoreWeights } from './run-score' +import { RunCritic, type RunTrace } from './run-critic' +import type { SteeringBundle } from './steering' + +export type HarnessIntervention = + | 'continue' + | 'plan' + | 'audit' + | 'recover' + | 'repair' + | 'verify' + | 'final_gate' + | 'wait_for_measurement' + | 'abort' + +export interface WorkflowTopology { + id: string + interventions: HarnessIntervention[] + maxParallelBranches?: number + metadata?: Record +} + +export interface MeasurementPolicy { + required: string[] + optional?: string[] + promoteOn?: Array +} + +export interface HarnessVariant { + id: 
string + steering?: SteeringBundle + topology?: WorkflowTopology + measurement?: MeasurementPolicy + budgets?: Record + models?: Record + reviewers?: Record + metadata?: Record +} + +export interface HarnessScenario { + id: string + task: string + split?: 'train' | 'validation' | 'test' | string + metadata?: Record +} + +export interface HarnessRunRequest { + variant: HarnessVariant + scenario: HarnessScenario + trialIndex: number +} + +export interface HarnessAdapter { + run(request: HarnessRunRequest): Promise +} + +export interface HarnessRunResult { + variant: HarnessVariant + scenario: HarnessScenario + trialIndex: number + trace: RunTrace + score: RunScore + aggregate: number +} + +export interface HarnessVariantReport { + variant: HarnessVariant + runs: HarnessRunResult[] + aggregateMean: number + passRate: number + costUsdMean: number + wallSecondsMean: number + scoreMean: RunScore +} + +export interface HarnessSelection { + winner: HarnessVariantReport + frontier: ParetoResult + reports: HarnessVariantReport[] +} + +export interface HarnessExperimentResult { + results: HarnessRunResult[] + selection: HarnessSelection +} + +export interface HarnessExperimentConfig { + adapter: HarnessAdapter + variants: HarnessVariant[] + scenarios: HarnessScenario[] + trialsPerScenario?: number + parallelism?: number + weights?: Partial + objectives?: Array> + score?: (trace: RunTrace, request: HarnessRunRequest) => RunScore | Promise + onResult?: (result: HarnessRunResult) => void | Promise +} + +export const DEFAULT_HARNESS_OBJECTIVES: Array> = [ + { name: 'aggregate', direction: 'maximize', value: (r) => r.aggregateMean }, + { name: 'pass_rate', direction: 'maximize', value: (r) => r.passRate }, + { name: 'cost', direction: 'minimize', value: (r) => r.costUsdMean }, + { name: 'wall', direction: 'minimize', value: (r) => r.wallSecondsMean }, +] + +export async function runHarnessExperiment(config: HarnessExperimentConfig): Promise { + const jobs = buildJobs(config) + 
const critic = new RunCritic({ weights: config.weights }) + const score = config.score ?? ((trace: RunTrace) => critic.scoreTrace(trace)) + const results = await mapLimit(jobs, config.parallelism ?? 1, async (request) => { + const trace = await config.adapter.run(request) + const runScore = await score(trace, request) + const result: HarnessRunResult = { + variant: request.variant, + scenario: request.scenario, + trialIndex: request.trialIndex, + trace, + score: runScore, + aggregate: aggregateRunScore(runScore, config.weights), + } + await config.onResult?.(result) + return result + }) + return { results, selection: selectHarnessVariant(results, config.objectives) } +} + +export function selectHarnessVariant( + results: HarnessRunResult[], + objectives: Array> = DEFAULT_HARNESS_OBJECTIVES, +): HarnessSelection { + const reports = summarizeHarnessResults(results) + if (reports.length === 0) throw new Error('selectHarnessVariant: no results') + const frontier = paretoFrontier(reports, objectives) + const candidates = frontier.frontier.length ? frontier.frontier : reports + const winner = [...candidates].sort((a, b) => b.aggregateMean - a.aggregateMean)[0] + if (!winner) throw new Error('selectHarnessVariant: no winner') + return { winner, frontier, reports } +} + +export function summarizeHarnessResults(results: HarnessRunResult[]): HarnessVariantReport[] { + const byVariant = new Map() + for (const result of results) { + byVariant.set(result.variant.id, [...(byVariant.get(result.variant.id) ?? 
[]), result]) + } + return [...byVariant.values()] + .map((runs) => { + const variant = runs[0]?.variant + if (!variant) throw new Error('summarizeHarnessResults: empty variant bucket') + return { + variant, + runs, + aggregateMean: mean(runs.map((r) => r.aggregate)), + passRate: mean(runs.map((r) => r.score.success)), + costUsdMean: mean(runs.map((r) => r.score.costUsd)), + wallSecondsMean: mean(runs.map((r) => r.score.wallSeconds)), + scoreMean: meanRunScore(runs.map((r) => r.score)), + } + }) + .sort((a, b) => b.aggregateMean - a.aggregateMean) +} + +function buildJobs(config: HarnessExperimentConfig): HarnessRunRequest[] { + if (config.variants.length === 0) throw new Error('runHarnessExperiment: at least one variant required') + if (config.scenarios.length === 0) throw new Error('runHarnessExperiment: at least one scenario required') + const trials = Math.max(1, Math.floor(config.trialsPerScenario ?? 1)) + const jobs: HarnessRunRequest[] = [] + for (const variant of config.variants) { + for (const scenario of config.scenarios) { + for (let trialIndex = 0; trialIndex < trials; trialIndex++) { + jobs.push({ variant, scenario, trialIndex }) + } + } + } + return jobs +} + +async function mapLimit( + items: T[], + limit: number, + fn: (item: T) => Promise, +): Promise { + const results: R[] = new Array(items.length) + let next = 0 + const workerCount = Math.max(1, Math.min(Math.floor(limit), items.length)) + await Promise.all(Array.from({ length: workerCount }, async () => { + while (next < items.length) { + const index = next++ + const item = items[index] + if (item === undefined) continue + results[index] = await fn(item) + } + })) + return results +} + +function mean(values: number[]): number { + return values.length ? 
values.reduce((sum, value) => sum + value, 0) / values.length : 0 +} + +function meanRunScore(scores: RunScore[]): RunScore { + return { + success: mean(scores.map((s) => s.success)), + goalProgress: mean(scores.map((s) => s.goalProgress)), + repoGroundedness: mean(scores.map((s) => s.repoGroundedness)), + driftPenalty: mean(scores.map((s) => s.driftPenalty)), + toolUseQuality: mean(scores.map((s) => s.toolUseQuality)), + patchQuality: mean(scores.map((s) => s.patchQuality)), + testReality: mean(scores.map((s) => s.testReality)), + finalGate: mean(scores.map((s) => s.finalGate)), + reviewerBlockers: mean(scores.map((s) => s.reviewerBlockers)), + costUsd: mean(scores.map((s) => s.costUsd)), + wallSeconds: mean(scores.map((s) => s.wallSeconds)), + notes: scores.flatMap((s) => s.notes ?? []), + } +} diff --git a/src/index.ts b/src/index.ts index 30ebe3b..bca7428 100644 --- a/src/index.ts +++ b/src/index.ts @@ -138,6 +138,26 @@ export type { SteeringOptimizerConfig, AxSteeringOptimizerConfig, } from './steering-optimizer' +export { + DEFAULT_HARNESS_OBJECTIVES, + runHarnessExperiment, + selectHarnessVariant, + summarizeHarnessResults, +} from './harness-optimizer' +export type { + HarnessAdapter, + HarnessExperimentConfig, + HarnessExperimentResult, + HarnessIntervention, + HarnessRunRequest, + HarnessRunResult, + HarnessScenario, + HarnessSelection, + HarnessVariant, + HarnessVariantReport, + MeasurementPolicy, + WorkflowTopology, +} from './harness-optimizer' export { JudgeRunner, runJudgeFleet, diff --git a/src/run-critic.ts b/src/run-critic.ts index 8a5089f..8abd883 100644 --- a/src/run-critic.ts +++ b/src/run-critic.ts @@ -50,6 +50,9 @@ export class RunCritic { const toolSpans = trace.spans.filter((s): s is Extract => s.kind === 'tool') const judgeSpans = trace.spans.filter((s): s is Extract => s.kind === 'judge') const sandboxSpans = trace.spans.filter((s): s is Extract => s.kind === 'sandbox') + const finalGateSpans = judgeSpans.filter((span) => + span.dimension 
=== 'final_gate' || span.attributes?.finalGate === true, + ) const success = trace.run.outcome?.pass === true ? 1 : trace.run.status === 'completed' ? 0.5 : 0 if (!success) notes.push('run did not complete with pass=true') @@ -78,6 +81,17 @@ export class RunCritic { : 0 if (!testReality) notes.push('no real test/build evidence recorded') + const blockerSpans = judgeSpans.filter((span) => + isBlockingJudge(span), + ) + const finalGateBlockers = finalGateSpans.filter((span) => isBlockingJudge(span)) + const finalGate = finalGateSpans.length ? (finalGateBlockers.length ? 0 : 1) : success + if (finalGateBlockers.length) notes.push(`final gate blocked by ${finalGateBlockers.length} reviewer(s)`) + else if (!finalGateSpans.length) notes.push('no final gate judgment recorded') + + const reviewerBlockers = judgeSpans.length ? blockerSpans.length / judgeSpans.length : 0 + if (reviewerBlockers) notes.push(`detected ${blockerSpans.length} blocking reviewer signal(s)`) + const positiveGroundingSignals = patchEvidence + sandboxSpans.length + @@ -108,6 +122,8 @@ export class RunCritic { toolUseQuality, patchQuality, testReality, + finalGate, + reviewerBlockers, costUsd, wallSeconds, notes, @@ -130,3 +146,15 @@ function normalizeJudgeScore(score: number): number { function looksRepoGrounded(text: string): boolean { return /(?:src\/|tests?\/|package\.json|tsconfig|\.ts\b|\.tsx\b|git status|pnpm |npm |vitest|pytest|jest)/i.test(text) } + +function isBlockingJudge(span: Extract): boolean { + return span.attributes?.blocking === true || + span.attributes?.verdict === 'BLOCKING' || + positiveNumber(span.attributes?.blockingFindings) || + positiveNumber(span.attributes?.highFindings) || + span.score <= 2 +} + +function positiveNumber(value: unknown): boolean { + return typeof value === 'number' && value > 0 +} diff --git a/src/run-score.ts b/src/run-score.ts index 665bac7..610771d 100644 --- a/src/run-score.ts +++ b/src/run-score.ts @@ -6,6 +6,8 @@ export interface RunScore { 
toolUseQuality: number patchQuality: number testReality: number + finalGate: number + reviewerBlockers: number costUsd: number wallSeconds: number notes?: string[] @@ -19,6 +21,8 @@ export interface RunScoreWeights { toolUseQuality: number patchQuality: number testReality: number + finalGate: number + reviewerBlockers: number costUsd: number wallSeconds: number } @@ -31,6 +35,8 @@ export const DEFAULT_RUN_SCORE_WEIGHTS: RunScoreWeights = { toolUseQuality: 1, patchQuality: 1.25, testReality: 1.5, + finalGate: 3, + reviewerBlockers: -2, costUsd: -0.2, wallSeconds: -0.1, } @@ -48,6 +54,8 @@ export function aggregateRunScore( w.toolUseQuality * clamp01(score.toolUseQuality) + w.patchQuality * clamp01(score.patchQuality) + w.testReality * clamp01(score.testReality) + + w.finalGate * clamp01(score.finalGate) + + w.reviewerBlockers * clamp01(score.reviewerBlockers) + w.costUsd * Math.max(0, score.costUsd) + w.wallSeconds * Math.max(0, score.wallSeconds / 60) ) diff --git a/tests/harness-optimizer.test.ts b/tests/harness-optimizer.test.ts new file mode 100644 index 0000000..3ca9585 --- /dev/null +++ b/tests/harness-optimizer.test.ts @@ -0,0 +1,106 @@ +import { describe, expect, it } from 'vitest' +import { + runHarnessExperiment, + selectHarnessVariant, + summarizeHarnessResults, + type HarnessAdapter, + type HarnessRunRequest, +} from '../src/harness-optimizer' +import type { RunScore } from '../src/run-score' +import type { RunTrace } from '../src/run-critic' + +describe('harness optimizer', () => { + it('runs the full variant x scenario x trial matrix and promotes the best topology', async () => { + const seen: string[] = [] + const adapter: HarnessAdapter = { + async run(request) { + seen.push(`${request.variant.id}:${request.scenario.id}:${request.trialIndex}`) + return traceFor(request) + }, + } + + const result = await runHarnessExperiment({ + adapter, + variants: [ + { id: 'linear', topology: { id: 'linear', interventions: ['continue', 'verify'] } }, + { id: 
'adaptive', topology: { id: 'adaptive', interventions: ['audit', 'repair', 'final_gate'] } }, + ], + scenarios: [ + { id: 'privacy', task: 'remove local PII' }, + { id: 'site-rip', task: 'extract component library' }, + ], + trialsPerScenario: 2, + parallelism: 3, + }) + + expect(seen).toHaveLength(8) + expect(result.results).toHaveLength(8) + expect(result.selection.winner.variant.id).toBe('adaptive') + expect(result.selection.reports[0]?.variant.id).toBe('adaptive') + }) + + it('keeps cost/latency tradeoffs on the Pareto frontier while choosing highest aggregate', () => { + const reports = summarizeHarnessResults([ + run('accurate', 0.95, 0.8, 0.4, 120), + run('cheap', 0.7, 0.7, 0.01, 10), + run('weak', 0.2, 0.2, 1, 300), + ]) + + const selection = selectHarnessVariant(reports.flatMap((r) => r.runs)) + expect(selection.winner.variant.id).toBe('accurate') + expect(selection.frontier.frontier.map((r) => r.variant.id).sort()).toEqual(['accurate', 'cheap']) + expect(selection.frontier.dominated.map((r) => r.variant.id)).toEqual(['weak']) + }) +}) + +function traceFor(request: HarnessRunRequest): RunTrace { + const strong = request.variant.id === 'adaptive' + return { + run: { + runId: `${request.variant.id}-${request.scenario.id}-${request.trialIndex}`, + scenarioId: request.scenario.id, + variantId: request.variant.id, + startedAt: 1_000, + endedAt: 2_000, + status: 'completed', + outcome: { pass: strong, score: strong ? 0.95 : 0.55 }, + }, + spans: [ + { runId: 'r', spanId: 'tool', kind: 'tool', name: 'apply_patch', toolName: 'apply_patch', args: {}, startedAt: 1, status: 'ok' }, + { runId: 'r', spanId: 'test', kind: 'sandbox', name: 'pnpm test', command: 'pnpm test', testsTotal: 10, testsPassed: strong ? 10 : 6, startedAt: 2 }, + ], + events: [], + artifacts: [{ artifactId: 'patch', runId: 'r', contentType: 'text/x-diff', sizeBytes: 10, hash: 'abc' }], + budget: [{ runId: 'r', dimension: 'usd', limit: 1, consumed: strong ? 
0.2 : 0.1, remaining: 0.8, timestamp: 3, breached: false }], + } +} + +function run(id: string, success: number, progress: number, costUsd: number, wallSeconds: number) { + const score: RunScore = { + success, + goalProgress: progress, + repoGroundedness: progress, + driftPenalty: 1 - progress, + toolUseQuality: progress, + patchQuality: progress, + testReality: success, + finalGate: success, + reviewerBlockers: success > 0.5 ? 0 : 1, + costUsd, + wallSeconds, + } + return { + variant: { id }, + scenario: { id: 's', task: 'task' }, + trialIndex: 0, + trace: { + run: { runId: id, scenarioId: 's', startedAt: 0, status: 'completed', outcome: { pass: success > 0.5, score: progress } }, + spans: [], + events: [], + artifacts: [], + budget: [], + }, + score, + aggregate: success * 4 + progress, + } +} diff --git a/tests/run-critic.test.ts b/tests/run-critic.test.ts new file mode 100644 index 0000000..2fa7c6f --- /dev/null +++ b/tests/run-critic.test.ts @@ -0,0 +1,54 @@ +import { describe, expect, it } from 'vitest' +import { RunCritic } from '../src/run-critic' +import type { RunTrace } from '../src/run-critic' + +describe('RunCritic', () => { + it('scores final gate pass and reviewer blockers from judge spans', () => { + const score = new RunCritic().scoreTrace(trace()) + expect(score.finalGate).toBe(1) + expect(score.reviewerBlockers).toBe(0.5) + expect(score.notes).toContain('detected 1 blocking reviewer signal(s)') + }) +}) + +function trace(): RunTrace { + return { + run: { + runId: 'r1', + scenarioId: 's1', + startedAt: 0, + endedAt: 1000, + status: 'completed', + outcome: { pass: true, score: 0.8 }, + }, + spans: [ + { + runId: 'r1', + spanId: 'j1', + kind: 'judge', + name: 'security final gate', + judgeId: 'security', + targetSpanId: 'commit-a', + dimension: 'final_gate', + score: 8, + startedAt: 1, + attributes: { finalGate: true, blocking: false }, + }, + { + runId: 'r1', + spanId: 'j2', + kind: 'judge', + name: 'patch audit', + judgeId: 'patch', + 
+ targetSpanId: 'commit-a', + dimension: 'patch', + score: 2, + startedAt: 2, + attributes: { blocking: true }, + }, + ], + events: [], + artifacts: [{ artifactId: 'a', runId: 'r1', contentType: 'text/plain', sizeBytes: 1, hash: 'h' }], + budget: [], + } +} From a0ae16a383bf7370292f31d4a5dffc27d42a017c Mon Sep 17 00:00:00 2001 From: Drew Stone Date: Fri, 24 Apr 2026 16:10:33 -0600 Subject: [PATCH 3/3] feat(0.7.2): extract muffled-gate scanner + CostTracker.recordVerdict helper MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Two reusable primitives promoted out of starter-foundry so every agent-eval consumer gets them for free: 1) scanForMuffledGates() + DEFAULT_FINDERS + UNIVERSAL_FINDERS (src/muffled-gate-scanner.ts, exported from index) Test helper that greps consumer source for gate/measurement anti-patterns and returns {file, line, pattern} findings. 5 default finders (fallback-to-pass, literal-true-pass, auto-match-no-expectation, skip-counts-as-pass, construct-vs-call-cwd). Supports per-file context-specific finders + auto-derived scan across importers of a target string (e.g. '@tangle-network/agent-eval'). `muffle-ok: ` annotation is the opt-out escape hatch. Pattern documented at starter-foundry/.evolve/patterns/muffled-gate.md (both gating + measurement layers). 10+ incidents in starter-foundry motivated this; any agent-eval consumer hits the same class. 2) CostTracker.recordVerdict(verdict, scenarioId, tags?) (src/cost-tracker.ts) Convenience: record + markOutcome in one call from a {usage, verdict}-shaped judge response. Returns null + no-ops when verdict has no usage (e.g. compile-gate short-circuit) so callers don't need their own guard. Starter-foundry's agent-eval-scaffold.mjs hand-rolls this 3-line pattern per seed; now one call. Tests: +12 (7 scanner + 4 recordVerdict + 1 absorbed). 336/336 pass. Build clean. Version 0.7.1 → 0.7.2. No breaking changes; purely additive exports. 
--- package.json | 2 +- src/cost-tracker.ts | 32 ++++ src/index.ts | 13 ++ src/muffled-gate-scanner.ts | 289 +++++++++++++++++++++++++++++ tests/cost-tracker.test.ts | 59 ++++++ tests/muffled-gate-scanner.test.ts | 166 +++++++++++++++++ 6 files changed, 560 insertions(+), 1 deletion(-) create mode 100644 src/muffled-gate-scanner.ts create mode 100644 tests/cost-tracker.test.ts create mode 100644 tests/muffled-gate-scanner.test.ts diff --git a/package.json b/package.json index a895f47..787573f 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "@tangle-network/agent-eval", - "version": "0.7.1", + "version": "0.7.2", "description": "Trace-first evaluation framework for Tangle agents. Core (spans, pipelines, sandbox harness, OTLP export), trust (dataset, red-team, calibration, behavior DSL), builder-of-builders (three-layer eval, resumable sessions, meta-runtime correlation), and frontier (meta-eval correlation study, Process Reward Modeling, bisector).", "type": "module", "main": "./dist/index.js", diff --git a/src/cost-tracker.ts b/src/cost-tracker.ts index 29dde18..7327fdc 100644 --- a/src/cost-tracker.ts +++ b/src/cost-tracker.ts @@ -72,6 +72,38 @@ export class CostTracker { bucket.completed = completed } + /** + * Convenience: record + markOutcome in one call from a + * `{ usage, verdict }`-shaped response (starter-foundry's + * `invokeMetaJudge` returns this shape; consumers that wrap any + * judge/critic can follow the same convention). + * + * `usage.model` must be present in `MODEL_PRICING` for cost math to + * populate; otherwise totalCostUsd stays at 0 for the entry but + * tokens still aggregate. 
+ */ + recordVerdict( + verdict: { + usage?: { inputTokens: number; outputTokens: number; model: string; cachedTokens?: number; reasoningTokens?: number } + verdict?: 'pass' | 'fail' | 'borderline' | string + }, + scenarioId: string, + tags?: Record, + ): CostEntry | null { + if (!verdict.usage) return null + const entry = this.record({ + scenarioId, + model: verdict.usage.model, + inputTokens: verdict.usage.inputTokens, + outputTokens: verdict.usage.outputTokens, + cachedTokens: verdict.usage.cachedTokens, + reasoningTokens: verdict.usage.reasoningTokens, + tags, + }) + this.markOutcome(scenarioId, verdict.verdict === 'pass') + return entry + } + get(scenarioId: string): ScenarioCost | undefined { return this.byScenario.get(scenarioId) } diff --git a/src/index.ts b/src/index.ts index bca7428..717aca8 100644 --- a/src/index.ts +++ b/src/index.ts @@ -266,6 +266,19 @@ export type { CostEntry, ScenarioCost, CostSummary, TokenSpec } from './cost-tra export { dominates, paretoFrontier } from './pareto' export type { Direction, Objective, ParetoResult } from './pareto' +export { + scanForMuffledGates, + formatFindings, + DEFAULT_FINDERS, + UNIVERSAL_FINDERS, + findFallbackToPass, + findLiteralTruePass, + findConstructorCwdDropped, + findAutoMatchNoExpectation, + findSkipCountsAsPass, +} from './muffled-gate-scanner' +export type { MuffledFinding, MuffledFinder, ScanOptions } from './muffled-gate-scanner' + export { analyzeSeries } from './series-convergence' export type { SeriesConvergenceOptions, SeriesConvergenceResult } from './series-convergence' diff --git a/src/muffled-gate-scanner.ts b/src/muffled-gate-scanner.ts new file mode 100644 index 0000000..e5d926c --- /dev/null +++ b/src/muffled-gate-scanner.ts @@ -0,0 +1,289 @@ +/** + * muffled-gate-scanner — test helper that greps consumer source for + * gate + measurement anti-patterns and fails with file:line locations. 
+ * + * Named pattern lives at starter-foundry's `.evolve/patterns/muffled-gate.md`; + * same shape applies to every consumer (a gate that should fail loud + * returns silent success; a metric that should emit a real number + * reports noise/empty). + * + * Usage (in a consumer project's test file): + * + * import { scanForMuffledGates, formatFindings, DEFAULT_FINDERS } from '@tangle-network/agent-eval' + * + * test('no muffled gates in eval surface', () => { + * const findings = scanForMuffledGates({ + * repoRoot: process.cwd(), + * scanFiles: ['src/eval/scaffold.ts', 'scripts/promote.mjs'], + * finders: DEFAULT_FINDERS, + * }) + * if (findings.length) assert.fail(formatFindings(findings)) + * }) + * + * Customize by passing your own `finders` — each finder is + * `(file, text) => MuffledFinding[]` and runs per-file. + * + * Escape hatch: any line containing `muffle-ok:` is excluded from all + * finders, letting consumers opt a legitimate fallback out explicitly. + */ + +import { readFileSync, existsSync, readdirSync, statSync } from 'node:fs' +import { join } from 'node:path' + +export interface MuffledFinding { + file: string + line: number + lineText: string + pattern: string +} + +export type MuffledFinder = (file: string, text: string) => MuffledFinding[] + +export interface ScanOptions { + /** Absolute path to the repo root. */ + repoRoot: string + /** Explicit file list (paths relative to repoRoot) for context-specific finders. */ + scanFiles: string[] + /** + * Auto-derived scan: walk `roots` for files matching `extensions` that contain the + * string `importsContain` and run the universal finders on them. Pattern + * from starter-foundry H4 (research/decisions/001) — catches new files + * with agent-eval import that would otherwise escape context-specific + * scan lists. + */ + autoDerive?: { + roots: string[] // e.g. ['src', 'scripts'] + extensions: RegExp // e.g. /\.(ts|mjs|js)$/ + importsContain: string // e.g. 
'@tangle-network/agent-eval' + universalFinders: MuffledFinder[] + } + /** Per-file finders (context-specific patterns). */ + finders: MuffledFinder[] +} + +/** + * Strip line comments + block-comment continuation lines from a single line + * so finders don't match prose about the pattern. + */ +function codeOf(line: string): string { + return line.replace(/\/\/.*$/, '').replace(/^\s*\*.*$/, '') +} + +/** Skip if the line carries the `muffle-ok:` escape hatch. */ +function isMuffleOk(line: string): boolean { + return line.includes('muffle-ok:') +} + +/** + * Default finder: `command || true` in a testCommand/setupCommand/cmd/command + * string. Swallows exit codes. + */ +export const findFallbackToPass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/\|\| true/.test(code) && /(testCommand|setupCommand|cmd|command)/.test(code)) { + out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'fallback-to-pass (|| true in command string)' }) + } + } + return out +} + +/** + * `testCommand: 'true'` literal silent-pass — an unknown-language dispatch + * arm that returns a no-op instead of throwing. + */ +export const findLiteralTruePass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/testCommand\s*:\s*['"]true['"]/.test(code)) { + out.push({ file, line: i + 1, lineText: line.trim(), pattern: 'literal-true-pass (testCommand: "true")' }) + } + } + return out +} + +/** + * `new SubprocessSandboxDriver({ cwd: ... })` — constructor arg silently + * dropped in agent-eval <0.7.1. 
0.7.1+ honors as fallback, but the form + * still invites confusion; prefer `new SubprocessSandboxDriver()` with + * cwd in the per-call HarnessConfig. + */ +export const findConstructorCwdDropped: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/new\s+SubprocessSandboxDriver\s*\(\s*\{[^}]*cwd\s*:/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'construct-vs-call cwd dropped (driver.exec reads config.cwd, not constructor.cwd)', + }) + } + } + return out +} + +/** + * `if (!expected) return true` — matcher auto-passes when ground truth is + * absent. Inflates accuracy metrics for scenarios without expectations. + */ +export const findAutoMatchNoExpectation: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! + if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/if\s*\(\s*!expected\s*\)\s*return\s+true/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'auto-match-no-expectation (if (!expected) return true)', + }) + } + } + return out +} + +/** + * `if (p.skipped) return true` — skip-counts-as-pass in quality scorers. + * Use three-valued `true | false | 'skipped'` return + explicit partial + * credit instead. + */ +export const findSkipCountsAsPass: MuffledFinder = (file, text) => { + const out: MuffledFinding[] = [] + const lines = text.split('\n') + for (let i = 0; i < lines.length; i++) { + const line = lines[i]! 
+ if (isMuffleOk(line)) continue + const code = codeOf(line) + if (!code.trim()) continue + if (/if\s*\(\s*\w+\.skipped\s*\)\s*return\s+true/.test(code)) { + out.push({ + file, + line: i + 1, + lineText: line.trim(), + pattern: 'skip-counts-as-pass (if (.skipped) return true)', + }) + } + } + return out +} + +/** + * The canonical default bundle. Callers can import these individually, + * replace them, or append custom finders for project-specific patterns. + */ +export const DEFAULT_FINDERS: MuffledFinder[] = [ + findFallbackToPass, + findLiteralTruePass, + findAutoMatchNoExpectation, + findSkipCountsAsPass, +] + +/** Finders that should run on EVERY file with the target import, not just SCAN_FILES. */ +export const UNIVERSAL_FINDERS: MuffledFinder[] = [ + findConstructorCwdDropped, +] + +/** + * Walk `roots` under `repoRoot` and return file paths (relative to repoRoot) + * whose contents include `importsContain`. + */ +function autoDeriveImporters( + repoRoot: string, + roots: string[], + extensions: RegExp, + importsContain: string, +): string[] { + const matches: string[] = [] + const walk = (rel: string) => { + const abs = join(repoRoot, rel) + if (!existsSync(abs)) return + for (const entry of readdirSync(abs)) { + const sub = join(rel, entry) + const subAbs = join(repoRoot, sub) + let st + try { st = statSync(subAbs) } catch { continue } + if (st.isDirectory()) { + if (entry === 'node_modules' || entry === 'dist' || entry === 'dist-tests' || entry.startsWith('.')) continue + walk(sub) + } else if (st.isFile() && extensions.test(entry)) { + if (entry.endsWith('.test.ts') || entry.endsWith('.test.mjs') || entry.endsWith('.test.js')) continue + let text: string + try { text = readFileSync(subAbs, 'utf8') } catch { continue } + if (text.includes(importsContain)) matches.push(sub) + } + } + } + for (const r of roots) walk(r) + return matches +} + +/** + * Run all finders against the configured files. Returns a flat list of + * findings. 
Callers format + assert as they prefer. + */ +export function scanForMuffledGates(opts: ScanOptions): MuffledFinding[] { + const findings: MuffledFinding[] = [] + const scanned = new Set() + + // Context-specific: run all finders on explicit SCAN_FILES. + for (const file of opts.scanFiles) { + const abs = join(opts.repoRoot, file) + if (!existsSync(abs)) continue + const text = readFileSync(abs, 'utf8') + for (const find of opts.finders) findings.push(...find(file, text)) + scanned.add(file) + } + + // Auto-derived: run universal finders on every importer not already scanned. + if (opts.autoDerive) { + const importers = autoDeriveImporters( + opts.repoRoot, + opts.autoDerive.roots, + opts.autoDerive.extensions, + opts.autoDerive.importsContain, + ) + for (const file of importers) { + if (scanned.has(file)) continue + const abs = join(opts.repoRoot, file) + if (!existsSync(abs)) continue + const text = readFileSync(abs, 'utf8') + for (const find of opts.autoDerive.universalFinders) findings.push(...find(file, text)) + } + } + + return findings +} + +/** + * Format findings into a single assert.fail-ready message. Each finding + * carries file:line + pattern name + the offending line. 
+ */ +export function formatFindings(findings: MuffledFinding[]): string { + if (findings.length === 0) return '' + return [ + `Found ${findings.length} muffled-gate pattern(s).`, + `Fix each or annotate the line with "// muffle-ok: ".`, + '', + ...findings.map((f) => ` ${f.file}:${f.line} — ${f.pattern}\n ${f.lineText}`), + ].join('\n') +} diff --git a/tests/cost-tracker.test.ts b/tests/cost-tracker.test.ts new file mode 100644 index 0000000..b9055c8 --- /dev/null +++ b/tests/cost-tracker.test.ts @@ -0,0 +1,59 @@ +import { describe, it, expect } from 'vitest' +import { CostTracker } from '../src/cost-tracker' + +describe('CostTracker.recordVerdict', () => { + it('records + markOutcome in one call from verdict.usage + verdict.verdict', () => { + const t = new CostTracker() + const entry = t.recordVerdict( + { + usage: { inputTokens: 1000, outputTokens: 500, model: 'gpt-4o-mini' }, + verdict: 'pass', + }, + 'scn-1', + { phase: 'meta-judge' }, + ) + expect(entry).not.toBeNull() + expect(entry!.scenarioId).toBe('scn-1') + expect(entry!.inputTokens).toBe(1000) + expect(entry!.tags?.phase).toBe('meta-judge') + + const s = t.summary() + expect(s.scenarioCount).toBe(1) + expect(s.completedCount).toBe(1) // verdict === 'pass' → markOutcome(true) + }) + + it('returns null + no-ops when verdict has no usage (e.g. 
compile-gate short-circuit)', () => { + const t = new CostTracker() + const entry = t.recordVerdict({ verdict: 'fail' }, 'scn-no-usage') + expect(entry).toBeNull() + expect(t.summary().scenarioCount).toBe(0) + }) + + it('verdict !== "pass" → markOutcome(false)', () => { + const t = new CostTracker() + t.recordVerdict( + { usage: { inputTokens: 100, outputTokens: 50, model: 'gpt-4o-mini' }, verdict: 'borderline' }, + 'scn-border', + ) + expect(t.summary().completedCount).toBe(0) + }) + + it('propagates cachedTokens + reasoningTokens to the underlying record', () => { + const t = new CostTracker() + t.recordVerdict( + { + usage: { + inputTokens: 500, + outputTokens: 200, + cachedTokens: 100, + reasoningTokens: 50, + model: 'claude-sonnet-4-20250514', + }, + verdict: 'pass', + }, + 'scn-cache', + ) + const bucket = t.get('scn-cache') + expect(bucket!.totalCachedTokens).toBe(100) + }) +}) diff --git a/tests/muffled-gate-scanner.test.ts b/tests/muffled-gate-scanner.test.ts new file mode 100644 index 0000000..e4f94a3 --- /dev/null +++ b/tests/muffled-gate-scanner.test.ts @@ -0,0 +1,166 @@ +import { describe, it, expect } from 'vitest' +import { mkdtempSync, writeFileSync, mkdirSync, rmSync } from 'node:fs' +import { tmpdir } from 'node:os' +import { join } from 'node:path' +import { + scanForMuffledGates, + formatFindings, + DEFAULT_FINDERS, + UNIVERSAL_FINDERS, + findFallbackToPass, + findConstructorCwdDropped, + findSkipCountsAsPass, +} from '../src/muffled-gate-scanner' + +/** + * Build an isolated temp repo with the given file map and return its path. 
+ */
+function fixture(files: Record<string, string>): string {
+  const root = mkdtempSync(join(tmpdir(), 'muffled-gate-scanner-'))
+  for (const [rel, content] of Object.entries(files)) {
+    const abs = join(root, rel)
+    mkdirSync(join(abs, '..'), { recursive: true }) // parent dir of the file, so nested keys like 'src/a.ts' can be written
+    writeFileSync(abs, content)
+  }
+  return root
+}
+
+describe('muffled-gate-scanner', () => {
+  it('finds `|| true` in a testCommand string', () => {
+    const root = fixture({
+      'src/runner.ts': `
+        const config = {
+          testCommand: 'pnpm run validate || pnpm run build || true',
+        }
+      `,
+    })
+    try {
+      const findings = scanForMuffledGates({
+        repoRoot: root,
+        scanFiles: ['src/runner.ts'],
+        finders: [findFallbackToPass],
+      })
+      expect(findings).toHaveLength(1)
+      expect(findings[0]!.pattern).toMatch(/fallback-to-pass/)
+      expect(findings[0]!.line).toBe(3) // fixture text opens with a newline, so testCommand is its third line
+    } finally {
+      rmSync(root, { recursive: true, force: true }) // remove the temp fixture even when an assertion throws
+    }
+  })
+
+  it('`muffle-ok:` annotation on the same line excludes the finding', () => {
+    const root = fixture({
+      'src/runner.ts': `
+        const config = {
+          testCommand: 'forge install || true', // muffle-ok: setup is best-effort; forge build is the real gate
+        }
+      `,
+    })
+    try {
+      const findings = scanForMuffledGates({
+        repoRoot: root,
+        scanFiles: ['src/runner.ts'],
+        finders: [findFallbackToPass],
+      })
+      expect(findings).toHaveLength(0)
+    } finally {
+      rmSync(root, { recursive: true, force: true })
+    }
+  })
+
+  it('auto-derive walks importers + applies universal finders to files not on explicit list', () => {
+    const root = fixture({
+      'src/a.ts': `
+        import { SubprocessSandboxDriver } from '@tangle-network/agent-eval'
+        const driver = new SubprocessSandboxDriver({ cwd: '/tmp' })
+      `,
+      'src/b.ts': `
+        import assert from 'node:assert'
+        const noop = true
+      `,
+      'scripts/c.mjs': `
+        import { SubprocessSandboxDriver } from '@tangle-network/agent-eval'
+        const driver2 = new SubprocessSandboxDriver({ cwd: '/tmp' })
+      `,
+    })
+    try {
+      const findings = scanForMuffledGates({
+        repoRoot: root,
+        scanFiles: [], // empty — rely entirely on auto-derive
+        finders: [],
+        autoDerive: {
+          roots: ['src', 'scripts'],
+          extensions: /\.(ts|mjs|js)$/,
+          importsContain: '@tangle-network/agent-eval',
+          universalFinders: [findConstructorCwdDropped],
+        },
+      })
+      // b.ts does NOT import agent-eval → skipped.
+      // a.ts and c.mjs both import + both have the bug → 2 findings.
+      expect(findings).toHaveLength(2)
+      expect(findings.map((f) => f.file).sort()).toEqual(['scripts/c.mjs', 'src/a.ts']) // sorted so the check does not depend on scan order
+    } finally {
+      rmSync(root, { recursive: true, force: true })
+    }
+  })
+
+  it('scanFiles takes precedence over auto-derive (dedup — no double-scan)', () => {
+    const root = fixture({
+      'src/a.ts': `
+        import { SubprocessSandboxDriver } from '@tangle-network/agent-eval'
+        const driver = new SubprocessSandboxDriver({ cwd: '/tmp' })
+      `,
+    })
+    try {
+      const findings = scanForMuffledGates({
+        repoRoot: root,
+        scanFiles: ['src/a.ts'], // explicit
+        finders: [findConstructorCwdDropped], // applied via explicit
+        autoDerive: {
+          roots: ['src'],
+          extensions: /\.ts$/,
+          importsContain: '@tangle-network/agent-eval',
+          universalFinders: [findConstructorCwdDropped], // also applied via auto — should NOT double-count
+        },
+      })
+      expect(findings).toHaveLength(1)
+    } finally {
+      rmSync(root, { recursive: true, force: true })
+    }
+  })
+
+  it('DEFAULT_FINDERS is a stable bundle that catches the common cases', () => {
+    const root = fixture({
+      'src/scorer.ts': `
+        function phaseOk(p) {
+          if (p.skipped) return true
+          return p.ok === true
+        }
+      `,
+    })
+    try {
+      const findings = scanForMuffledGates({
+        repoRoot: root,
+        scanFiles: ['src/scorer.ts'],
+        finders: DEFAULT_FINDERS,
+      })
+      expect(findings.length).toBeGreaterThan(0)
+      expect(findings.some((f) => f.pattern.includes('skip-counts-as-pass'))).toBe(true)
+    } finally {
+      rmSync(root, { recursive: true, force: true })
+    }
+  })
+
+  it('formatFindings returns assert.fail-ready message with file:line + pattern + line body', () => {
+    const msg = formatFindings([
+      { file: 'src/a.ts', line: 42, lineText: "testCommand: 'foo || true',", pattern: 'fallback-to-pass' },
+    ])
+    expect(msg).toMatch(/src\/a\.ts:42/)
+    expect(msg).toMatch(/fallback-to-pass/)
+    expect(msg).toMatch(/muffle-ok:/) // escape-hatch hint included
+  })
+
+  it('exports UNIVERSAL_FINDERS which includes the construct-vs-call cwd finder', () => {
+    expect(UNIVERSAL_FINDERS).toContain(findConstructorCwdDropped)
+  })
+})