diff --git a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md index 92a6e022e..2c7dde20b 100644 --- a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md +++ b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md @@ -13,7 +13,13 @@ created before this contract run unchanged (the conductor falls back to its prio ```jsonc { + // — sidecar metadata (Codex-convention-aligned; deliberately lighter — no createdAt/updatedAt, kept deterministic) — "schemaVersion": 1, + "schemaName": "run-ledger", + "schemaUrl": ".claude/skills/deep-plan-pipeline/references/run-ledger.schema.md", + "artifactKind": "deep-plan-run-ledger", + "producer": { "skill": "deep-plan", "skillVersion": "1.0.0", "module": "run-ledger.mjs" }, + "legs": { "deepPlan": { "status": "", "verdict": "sound|minor-only|needs-critical-fixes|has-major-gaps", "degraded": false, "at": "YYYY-MM-DD" }, "review": { "status": "", "verdict": "", "counts": { /* … */ }, "at": "YYYY-MM-DD" }, @@ -21,10 +27,74 @@ created before this contract run unchanged (the conductor falls back to its prio "implementation": { "status": "", "rawStatus": "", "branch": "…", "at": "YYYY-MM-DD" } }, "loopbackCount": 0, // bumped by the revise leg on each revise→re-review loop - "lastEscalation": null // conductor-managed + "lastEscalation": null, // conductor-managed + + // — Phase B: governable-autonomy fields (all OPTIONAL; set via writeMeta; absent ⇒ prior behaviour) — + "authority": "triage", // what the NEXT stage MAY do: triage < implement < push < merge (fail-closed) + "goal": { // the quantified "done", evaluated by evaluateGoal(goal, {cwd}) + "metric": "", + "predicates": [ { "label": "parity", "kind": "command", "cmd": "node …/X-golden-parity.test.js" } ], + "met": false, // = AND(predicate exit codes); empty predicates ⇒ false (fail-closed) + "bounds": { "maxIterations": 3, "capTokens": 800000, "noProgressRounds": 2 } + }, + + // — Phase C: loop-engine resume state (written by run-loop.mjs; absent until the loop runs) — + "loop": { + 
"completed": ["item-id"], // queue items whose action + goal passed (the resume key set) + "escalated": { "item": "…", "reason": "goal-not-met|goal-absent|action-failed|invalid-queue|authority-denied|maxIterations|capTokens|noProgressRounds", "failing": [""] } // null when complete + } } ``` +## Governable-autonomy fields (Phases B–C) + +These make the ledger ready for the autonomous loop engine (`run-loop.mjs`, Phase C — **built but +un-wired** to real domains). Each is optional and inert until a consumer reads it (the loop engine, once +its real-domain config is wired; the conductor for resume). + +- **`authority`** — the granted permission tier (`triage < implement < push < merge`), checked by + `authorityAllows(ledger, action)`. **Fail-closed**: absent/unknown authority permits nothing, so a + handoff can never silently escalate from triage to merge. +- **`goal.bounds`** — the ACTIVE loop boundary, read by `boundExceeded(bounds, {iterations, tokens, + noProgress})`, which returns the tripped bound name (a stop reason) or null. The boundary that prevents + runaway (the 6M-token failure had none). Bound fields are positive integers; `0`/negative trip + immediately and should be treated as misconfiguration. +- **`budget`** (reserved) — a top-level `{capTokens, spent}` field for the loop's own spend-tracking; + NOT yet read by any helper or by the Phase-C loop engine (which today tracks completion in + `loop.completed`, not token spend). The **active** token cap is `goal.bounds.capTokens`; the budget + field is wired when the loop's real-domain config tracks spend. +- **`loop`** — the loop engine's resume state (`{completed, escalated}`), written by `run-loop.mjs` and + read back by `firstPendingItem` to resume mid-queue. Disjoint from `legs` (a real pipeline's per-leg + state) — the two coexist in one ledger without collision. +- **`goal`** — the quantified "done", evaluated by `evaluateGoal(goal, {cwd})`: + `met = AND(predicate.cmd exit 0)`. 
Every predicate is a **falsifiable command** (deterministic-first; + a metric/verdict predicate is just a command that exits 0/1). **Empty predicates ⇒ not met** + (fail-closed: a loop needs something that can say *no*). The loop runs + `until met ∨ boundExceeded(goal.bounds) → else escalate`. + +## Sibling: `run-events.jsonl` (append-only transition log) + +Written alongside the ledger by `appendEvent()` (called write-ahead from every `writeLeg`/`writeMeta`). +The Claude-side machine transition trail — the lighter counterpart to the Codex runtime's +`deep-plan-events.jsonl` (same `.claude`/`.agents` runtime split). One JSON object per line, never +rewritten; each carries a **monotonic 1-based `seq`** for ordering + replay/gap detection. + +```jsonc +{ "seq": 1, "type": "leg", "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" } +{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null } +{ "seq": 3, "type": "loop", "item": "…", "actionOk": true, "actionSkipped": false, "met": true } // Phase C: one per loop item +{ "seq": 4, "type": "loop-end", "status": "complete|escalated", "reason": null, "completed": 2 } // Phase C: loop terminal +``` + +Write-ahead ordering (event appended *before* the ledger write) means a crash over-counts (an event +with no committed state — auditable) rather than silently under-counting; a torn-write guard in +`appendEvent` separates a crashed partial line so `seq` stays sane. Reader (intended): **observability** +— the events log is NOT yet consumed by any code; the conductor resumes from `run-ledger.json` via +`firstPendingLeg`, not from this file. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude +posture; the Codex `deep-plan-events.jsonl` carries the heavier registered-schema discipline). 
+ ## Status enum (the single point that preserves verdict semantics) A leg's `status` is **derived from its verdict** — never collapsed to a binary done/not-done — via the diff --git a/.claude/skills/deep-plan/DECISIONS.md b/.claude/skills/deep-plan/DECISIONS.md new file mode 100644 index 000000000..b43c91ef7 --- /dev/null +++ b/.claude/skills/deep-plan/DECISIONS.md @@ -0,0 +1,47 @@ +# DECISIONS — deep-plan skill family + +Human-legible decision journal: **why**, not what. Append dated, high-level entries for meaningful +decisions only (not routine actions, never secrets). Distinct from `run-events.jsonl` (machine +transition log) and `CHANGELOG.md` (releases). Purpose: a person who stays out of the per-step loop +can re-enter with full understanding. *(Steinberger's persistent log; Karpathy: "outsource your +thinking, never your understanding.")* + +--- + +## 2026-06-15 — Run-state ledger as the integration seam (not a monolith) +**Decision:** integrate the legs through ONE shared `run-ledger.json` (state) + `signal-map.md` +(routing), with the conductor as a thin reader — rather than welding the pipeline into a single +workflow. **Why:** every source (Anthropic skills, Spec Kit, orchestrator-worker) and our own +6M-token convergence failure say composed-with-contracts beats monolith; "seamless handoff" comes +from the shared state contract, not a shared container. + +## 2026-06-15 — Build the harness lean + test-gated, NOT via /deep-plan on itself +**Decision:** harness/tooling changes are built directly, gated by `node --test`, using `/deep-plan` +at most ONCE to extract invariants. **Why:** two `/deep-plan` runs to plan the ledger cost ~6M tokens +and never converged (category error: app-change planner aimed at meta-tooling; self-referential; +additive-by-design amplifies complexity). The test suite is the verifier, not another audit loop. +Captured in memory: `deep-plan-harness-self-modification`. 
+ +## 2026-06-15 — Status derives from verdict; never collapse to binary done/not-done +**Decision:** a leg's ledger status is *derived* from its verdict via maps mirroring `signal-map.md`, +fail-closed. **Why:** the binary-flatten defect (a `has-major-gaps` leg-1 auto-advancing to review) +was the exact resume-correctness bug both audits kept flagging; deriving status preserves the +conductor's routing semantics so a resumed run can never skip a certification gate. + +## 2026-06-15 — Governable-autonomy essentials = authority, completion condition, decision journal +**Decision:** prioritize three of the four "final" handoff components — `authority` tier, quantified +`goal`/completion condition, and this journal — as additive ledger fields + one doc. **Why:** +Karpathy's rule for autonomy is "set the boundaries first" (objective, metric, permissions) and keep +a record you can re-enter through. Budget boundary + the rollout loop are deferred until a consumer +(the loop) exists — don't build machinery ahead of its substrate. + +## 2026-06-16 — Built the loop ENGINE (Phase C) but kept it UN-WIRED +**Decision:** build a generic, domain-agnostic loop engine (`run-loop.mjs`) now — validated against a +synthetic harness — but inject `actionFor`/`goalFor` and leave it un-wired to real domains. **Why:** the +"don't build machinery before its substrate" rule applies at the *wiring* boundary, not the *engine* +boundary. The engine's mechanics (advance / gate / escalate / bound / resume) are knowable from first +principles + the orchestrator-worker research and are fully testable against mocks — so a synthetic +harness *can* say "no" to the engine. What's genuinely unknowable without Stage 1 is the *fit* to real +migration (the real queue, the real parity tests, the escalation UX), so the **wiring** stays deferred. 
+A synthetic test proves "the engine does what it's coded to," not "this design fits real migration" — the +latter still wants Stage 1 to prove one migration by hand first. diff --git a/.claude/skills/deep-plan/VISION.md b/.claude/skills/deep-plan/VISION.md new file mode 100644 index 000000000..dd5d71c07 --- /dev/null +++ b/.claude/skills/deep-plan/VISION.md @@ -0,0 +1,84 @@ +# VISION — The deep-plan skill family as a governable-autonomy foundation + +**North star.** A composed system of single-purpose skills that plans, reviews, implements, and +audits changes — integrated through *shared contracts* (the run-ledger state + the signal-map +routing law), not a monolith — so work can be tracked, handed off, resumed, and (eventually) looped +**without a human in every step, yet never without a human's understanding.** + +This file is the constitution: it states intent + boundaries + the extension contract, so each +addition (and each loop tick, later) does not re-derive intent from scratch. + +## The 7-layer reference architecture (industry-convergent: Anthropic skills, Spec Kit, orchestrator-worker) + +| Layer | Role | Where it lives | Status | +|---|---|---|---| +| 1. Constitution / VISION | durable intent + boundaries | this file | ✅ | +| 2. Staged artifact pipeline | spec → plan → review → implement → audit | the deep-plan skills | ✅ | +| 3. Control-plane / worker | orchestrator routes; workers execute (tool-enforced) | `deep-plan-pipeline` conductor | ⚠️ prose-enforced | +| 4. Durable checkpoint | resume from state N, not scratch | `run-ledger.json` | ✅ | +| 5. Validator that says "no" | tests / audit refute the work | `node --test` + adversarial audit | ✅ | +| 6. Bounded routing | verdict → action, caps, no improvisation | `signal-map.md` + bounds | ✅ | +| 7. 
Decision-ready HITL escalation | opinionated brief, not a raw question | conductor escalation block | ⚠️ format | + +## Governable-autonomy components (the tracking surface) + +Every component is a **field on the run-ledger or a sibling file** — never a new monolith. + +| Component | Form | Purpose | Status | +|---|---|---|---| +| Tracking | `run-ledger.json` (`legs`, status) | resume at the right leg | ✅ shipped | +| Handoff | the ledger contract | leg→leg carries the verdict | ✅ shipped | +| Transcript | platform `subagents/workflows/*` | full-fidelity replay | ✅ platform | +| Logging | `run-events.jsonl` (append-only, `seq`-stamped, write-ahead) | machine transition trail; intended reader = observability (not yet consumed by any code — the conductor resumes from `run-ledger.json`, not from this log) | ✅ Phase A — Claude-side counterpart to Codex `deep-plan-events.jsonl`, deliberately lighter (unit-tested, not JSON-schema-registered) | +| Sidecar conformance | `schemaName`/`schemaUrl`/`artifactKind`/`producer` | cross-side consistency | ✅ Phase A — Codex-convention-aligned, kept deterministic (omits `createdAt`/`updatedAt`) | +| **Authority** | `run-ledger.authority` + `authorityAllows()` | what the next stage MAY do (triage= want +} + +// ── Phase B: completion condition — the quantified "done" ────────────────────────────────── +// done = (P₁ ∧ … ∧ Pₙ), each predicate a FALSIFIABLE command (exit 0 = pass). Deterministic-first; +// a "metric"/"verdict" predicate is just a command that computes it and exits 0/1. Empty predicates → +// NOT met (fail-closed: a loop needs something that can say no). `bounds` are the loop's stop conditions, +// evaluated separately via boundExceeded(). A chatty-but-passing command is not misread as a failure +// (maxBuffer raised). +// +// TRUST BOUNDARY: each predicate `cmd` runs in a shell with the conductor's privileges. Predicates MUST +// be operator- or plan-authored; NEVER derive a predicate cmd from untrusted input. When a consumer (the +// loop) wires this in, gate it behind authorityAllows(ledger, 'implement'). 
/**
 * Evaluate a goal's completion condition: `met` is the conjunction of its predicate commands.
 *
 * Each predicate's `cmd` is run synchronously in a shell via `execSync`; a command that returns
 * without throwing counts as a pass, anything that throws (non-zero exit, timeout, maxBuffer
 * overflow) counts as a failure and its message (truncated to 160 chars) is reported in `error`.
 *
 * Fail-closed rules:
 *   - no goal / no `predicates` array / empty array → `met` is false (`results` is `[]`);
 *   - a predicate whose `cmd` is not a non-empty string (missing, `''`, a number, an array, a null
 *     entry, …) is NOT executed and is reported with `error: 'predicate has no cmd'`.
 *
 * TRUST BOUNDARY: `cmd` is executed as a shell command line, so predicates must come from a
 * trusted author — never build one from untrusted input.
 *
 * @param {{ predicates?: Array<{ cmd?: string, label?: string, kind?: string }> }} goal
 * @param {object} [options]
 * @param {string} [options.cwd] working directory handed to `execSync`
 * @param {number} [options.timeoutMs=120000] per-predicate timeout in milliseconds
 * @param {number} [options.maxBuffer=10485760] per-predicate stdout/stderr buffer limit in bytes
 * @returns {{ met: boolean, results: Array<{ label: *, kind: *, pass: boolean, error: (string|null) }> }}
 */
export function evaluateGoal(goal, { cwd, timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024 } = {}) {
  const predicates = goal && Array.isArray(goal.predicates) ? goal.predicates : []
  const results = predicates.map((p) => {
    // Normalise once: only a string is ever treated as a command. The fallback label is derived from
    // this normalised value, so a malformed cmd (e.g. a number) can no longer reach `.slice` and throw
    // the very TypeError the guard below exists to prevent.
    const cmd = p && typeof p.cmd === 'string' ? p.cmd : ''
    let pass = false
    let error = null
    if (!cmd) {
      error = 'predicate has no cmd' // guard: a clean message, not a raw Node TypeError
    } else {
      try {
        execSync(cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer })
        pass = true
      } catch (e) {
        error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit'
      }
    }
    // An explicit label wins; otherwise fall back to the first 60 chars of the command ('' if none).
    return { label: p?.label ?? cmd.slice(0, 60), kind: p?.kind ?? 'command', pass, error }
  })
  const met = results.length > 0 && results.every((r) => r.pass) // empty → not met
  return { met, results }
}

/**
 * Has a loop bound tripped? The other half of `stop = done ∨ bound`.
 *
 * Bounds are checked in a fixed order (maxIterations, capTokens, noProgressRounds) and the first one
 * whose counter has reached its limit (`counter >= limit`) is returned. A bound that is `null` or
 * `undefined` is not enforced. Because the comparison is `>=` and counters default to 0, a limit of 0
 * (or a negative limit) trips immediately.
 *
 * @param {{ maxIterations?: number, capTokens?: number, noProgressRounds?: number } | null | undefined} bounds
 * @param {{ iterations?: number, tokens?: number, noProgress?: number }} [counters] current loop counters (each defaults to 0)
 * @returns {('maxIterations'|'capTokens'|'noProgressRounds'|null)} the tripped bound's name (a stop reason), or null
 */
export function boundExceeded(bounds, { iterations = 0, tokens = 0, noProgress = 0 } = {}) {
  if (!bounds) return null
  if (bounds.maxIterations != null && iterations >= bounds.maxIterations) return 'maxIterations'
  if (bounds.capTokens != null && tokens >= bounds.capTokens) return 'capTokens'
  if (bounds.noProgressRounds != null && noProgress >= bounds.noProgressRounds) return 'noProgressRounds'
  return null
}
// Generic, domain-agnostic LOOP ENGINE over the run-ledger (Phase C).
//
// Advances a queue: for each item it runs the item's ACTION (the work), then evaluates the item's
// completion GOAL (the validator that can say "no"), respecting AUTHORITY and BOUNDS, recording every
// step to the run-ledger so the loop is RESUMABLE, and ESCALATING on the first failure or tripped bound.
//
// UN-WIRED by design: the per-item action command and goal predicates are INJECTED via `actionFor(item)`
// and `goalFor(item)`. The real config — pointing this at the Stage-3+ per-domain migration skill + its
// real parity tests — is a config step deferred until Stage 1 proves one migration by hand. This module
// is validated against a SYNTHETIC harness (run-loop.test.mjs), which proves the engine MECHANICS, not
// the real-migration fit.
//
// SAFETY: action/goal commands run in a shell (same trust boundary as evaluateGoal) and MUST be
// operator/plan-authored. The loop is fail-closed: it runs only when the ledger (or the explicit param)
// grants 'implement' authority.
import { execSync } from 'node:child_process'
import { readLedger, writeMeta, appendEvent, authorityAllows, evaluateGoal, boundExceeded } from './run-ledger.mjs'

// A queue item is either a bare string id or an object carrying an `id`.
const itemId = (it) => (typeof it === 'string' ? it : it && it.id)
// Short, log-safe rendering of a thrown value (message if present, capped at 160 chars).
const msg = (e) => String((e && e.message) || e).slice(0, 160)

/**
 * The resumable primitive: the first queue item whose id is not yet recorded as completed.
 *
 * @param {Array<string|{id: string}>} queue ordered work queue (a nullish queue is treated as empty)
 * @param {Iterable<string>} [completed] ids already completed
 * @returns {(string|{id: string}|undefined)} the first pending item, or undefined when none is pending
 */
export function firstPendingItem(queue, completed) {
  const done = new Set(completed)
  return (queue || []).find((it) => !done.has(itemId(it)))
}

/**
 * Run (or resume) the loop over `queue`.
 *
 * For each pending item, in queue order: check the bounds, run the item's action command (if
 * `actionFor` yields one), evaluate the item's goal, append a `loop` event, and either record the item
 * as completed or stop and escalate. Every stop also appends a `loop-end` event.
 *
 *   status 'complete'  — every queue item passed its goal
 *   status 'escalated' — stopped on authority-denied / invalid-queue / a tripped bound /
 *                        action-failed / goal-absent / goal-not-met (see `reason`)
 *
 * Resumable: completed ids are read back from `ledger.loop.completed`, so re-invoking after an
 * escalation skips them. That persisted list is loaded BEFORE the authority and queue checks and is
 * carried through every stop, so a denied or malformed re-invocation no longer overwrites the resume
 * state with an empty list (which would make the next authorised run repeat finished actions).
 *
 * @param {object} [options]
 * @param {string} options.planDir directory holding the run-ledger
 * @param {Array<string|{id: string}>} [options.queue=[]] ordered items; each needs a unique non-empty string id
 * @param {(item: *) => (string|undefined)} [options.actionFor] returns the item's action command; a falsy result means "goal-only"
 * @param {(item: *) => ({predicates: Array}|undefined)} [options.goalFor] returns the item's goal
 * @param {string} [options.authority] explicit grant; when nullish the ledger's `authority` is used
 * @param {object} [options.bounds={}] stop bounds; only the iteration counter (= number of completed items) is supplied here
 * @param {string} [options.cwd] working directory for action and goal commands
 * @param {number} [options.timeoutMs=120000] timeout for each action command and each goal predicate
 * @param {number} [options.maxBuffer=10485760] output buffer limit for each action command and each goal predicate
 * @returns {{ status: ('complete'|'escalated'), completed: string[], reason?: string, item?: string, failing?: string[], actionErr?: (string|null), authority?: (string|null), detail?: string }}
 */
export function runLoop({
  planDir, queue = [], actionFor, goalFor, authority, bounds = {}, cwd,
  timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024,
} = {}) {
  // One ledger read serves both the authority fallback and the resume state.
  const ledger = readLedger(planDir) || {}
  const persisted = (ledger.loop || {}).completed
  const completed = Array.isArray(persisted) ? [...persisted] : []

  const granted = authority ?? ledger.authority
  if (!authorityAllows({ authority: granted }, 'implement')) {
    return finish(planDir, 'escalated', { reason: 'authority-denied', authority: granted ?? null, completed })
  }
  // queue precondition (fail-closed): every item needs a stable, UNIQUE, non-empty string id (the resume
  // key). A malformed queue must be REJECTED, never silently "completed" — an item with no resumable
  // identity, or a duplicate id that would collapse two distinct items into one, is an error.
  const ids = (queue || []).map(itemId)
  if (ids.some((id) => typeof id !== 'string' || id === '')) {
    return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'every queue item needs a non-empty string id', completed })
  }
  if (new Set(ids).size !== ids.length) {
    return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'duplicate item ids', completed })
  }

  for (;;) {
    const item = firstPendingItem(queue, completed)
    if (item === undefined) return finish(planDir, 'complete', { completed })

    // The iteration counter is the TOTAL number of completed items (including earlier runs), so the cap
    // holds across resumes.
    const bound = boundExceeded(bounds, { iterations: completed.length })
    if (bound) return finish(planDir, 'escalated', { reason: bound, item: itemId(item), completed })

    // 1. run the item's ACTION (the work that mutates state). A falsy actionFor is a goal-only item,
    //    recorded as actionSkipped so a typo'd real wiring leaves a signal in the trail (not silent).
    let actionOk = true
    let actionErr = null
    const cmd = actionFor && actionFor(item)
    const actionSkipped = !cmd
    if (cmd) {
      try {
        execSync(cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer })
      } catch (e) {
        actionOk = false
        actionErr = msg(e)
      }
    }
    // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed.
    //    A missing/empty goal is 'goal-absent' (fail-closed config error), distinct from 'goal-not-met'.
    const goal = goalFor && goalFor(item)
    const goalAbsent = !goal || !Array.isArray(goal.predicates) || goal.predicates.length === 0
    const ev = actionOk ? evaluateGoal(goal, { cwd, timeoutMs, maxBuffer }) : { met: false, results: [] }
    appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, actionSkipped, met: ev.met })

    if (actionOk && ev.met) {
      completed.push(itemId(item))
      writeMeta(planDir, { loop: { completed, escalated: null } })
      continue
    }
    // first failure → escalate (stop), preserving what completed (resume picks up here)
    const reason = !actionOk ? 'action-failed' : (goalAbsent ? 'goal-absent' : 'goal-not-met')
    const failing = goalAbsent ? [''] : ev.results.filter((r) => !r.pass).map((r) => r.label)
    writeMeta(planDir, { loop: { completed, escalated: { item: itemId(item), reason, failing } } })
    // state (including `failing`) was written just above, so finish() must not overwrite it
    return finish(planDir, 'escalated', { reason, item: itemId(item), failing, completed, actionErr }, false)
  }
}

/**
 * Terminate the loop: optionally persist the final `loop` state, append the `loop-end` event, and build
 * the caller-facing result.
 *
 * @param {string} planDir directory holding the run-ledger
 * @param {('complete'|'escalated')} status terminal status
 * @param {object} data result payload; `completed`, `reason` and `item` are also persisted/logged
 * @param {boolean} [writeState=true] pass false when the caller has already written `loop` state
 * @returns {object} `{ status, ...data }`
 */
function finish(planDir, status, data, writeState = true) {
  // uniform `escalated` shape across ALL stop reasons (authority / invalid-queue / bound / failure)
  if (writeState) {
    const escalated = status === 'complete' ? null : { reason: data.reason, item: data.item ?? null }
    writeMeta(planDir, { loop: { completed: data.completed || [], escalated } })
  }
  appendEvent(planDir, { type: 'loop-end', status, reason: data.reason ?? null, completed: (data.completed || []).length })
  return { status, ...data }
}
// Governance/observability tests for the run-ledger: sidecar metadata, the append-only event
// log (Phase A), plus authority tiers and the goal/completion-condition evaluator (Phase B).
// Deterministic gate — the validator that says "no" for the governable-autonomy layer.
import { test, after } from 'node:test'
import assert from 'node:assert/strict'
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import {
  writeLeg, writeMeta, readLedger, appendEvent, EVENTS_FILE,
  authorityAllows, authorityRank, evaluateGoal, boundExceeded,
} from '../scripts/run-ledger.mjs'

// Every scratch directory is tracked and removed once the file's tests have run, so repeated runs do
// not accumulate `run-ledger-gov-*` directories under the OS temp dir.
const scratchDirs = []
const tmp = () => {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'run-ledger-gov-'))
  scratchDirs.push(dir)
  return dir
}
after(() => {
  for (const dir of scratchDirs) fs.rmSync(dir, { recursive: true, force: true })
})

// Parse the append-only event log of plan dir `d` into an array of event objects (blank lines ignored).
const events = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').filter(Boolean).map((l) => JSON.parse(l))

// ---- Phase A: sidecar metadata conformance ----
test('a fresh ledger carries full sidecar metadata (schemaVersion/Name/Url + artifactKind + producer.skillVersion)', () => {
  const d = tmp()
  const led = writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' })
  assert.equal(led.schemaVersion, 1)
  assert.equal(led.schemaName, 'run-ledger')
  assert.match(led.schemaUrl, /run-ledger\.schema\.md$/)
  assert.equal(led.artifactKind, 'deep-plan-run-ledger')
  assert.equal(led.producer.skill, 'deep-plan')
  assert.equal(led.producer.skillVersion, '1.0.0')
})

// ---- Phase A: append-only event log ----
test('every writeLeg/writeMeta appends a parseable event with monotonic seq; append-only (never rewritten)', () => {
  const d = tmp()
  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound', at: '2026-06-15' })
  writeLeg(d, 'review', { status: 'done-advance', verdict: 'minor-only', at: '2026-06-15' })
  writeMeta(d, { loopbackCount: 1 })
  const ev = events(d)
  assert.equal(ev.length, 3) // accumulates, not overwrites
  assert.deepEqual(ev.map((e) => e.seq), [1, 2, 3]) // monotonic → gap/replay detectable
  assert.deepEqual(ev[0], { seq: 1, type: 'leg', leg: 'deepPlan', status: 'done-advance', verdict: 'sound', at: '2026-06-15' })
  assert.equal(ev[1].leg, 'review')
  assert.equal(ev[2].type, 'meta')
  assert.deepEqual(ev[2].keys, ['loopbackCount'])
})

test('events are WRITE-AHEAD: after a successful write the ledger state and its event agree', () => {
  const d = tmp()
  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' })
  assert.equal(readLedger(d).legs.deepPlan.status, 'done-advance')
  assert.equal(events(d)[0].status, 'done-advance') // append-before-write: never silently under-counts
})

test('appendEvent is independently usable, order-preserving, and seq-stamped', () => {
  const d = tmp()
  appendEvent(d, { type: 'note', n: 1 })
  appendEvent(d, { type: 'note', n: 2 })
  assert.deepEqual(events(d).map((e) => [e.seq, e.n]), [[1, 1], [2, 2]])
})

// ---- Phase B: authority tiers (fail-closed, monotonic) ----
test('authority is monotonic and fail-closed: no/unknown authority permits nothing; grants never silently escalate', () => {
  assert.equal(authorityAllows({}, 'triage'), false) // no grant → nothing (fail-closed)
  assert.equal(authorityAllows({ authority: 'bogus' }, 'triage'), false) // unknown → nothing
  assert.equal(authorityAllows({ authority: 'triage' }, 'triage'), true)
  assert.equal(authorityAllows({ authority: 'triage' }, 'push'), false) // can't escalate above grant
  assert.equal(authorityAllows({ authority: 'merge' }, 'push'), true) // higher grant covers lower
  // fail-closed on the ACTION side too: an unknown/typo'd/case-mismatched action is denied, never granted
  assert.equal(authorityAllows({ authority: 'merge' }, 'bogus'), false)
  assert.equal(authorityAllows({ authority: 'merge' }, 'MERGE'), false) // case-sensitive
  assert.equal(authorityAllows({ authority: 'merge' }, undefined), false)
  assert.ok(authorityRank('merge') > authorityRank('triage'))
})

// ---- Phase B: completion condition (conjunction of falsifiable commands, fail-closed) ----
test('evaluateGoal: met iff ALL predicate commands pass (exit 0); empty predicates → NOT met (a loop needs a "no")', () => {
  assert.equal(evaluateGoal({ predicates: [{ cmd: 'true' }, { cmd: 'true' }] }).met, true)
  const mixed = evaluateGoal({ predicates: [{ cmd: 'true' }, { cmd: 'false', label: 'gate' }] })
  assert.equal(mixed.met, false) // conjunction: one failing predicate fails the whole
  assert.equal(mixed.results[1].pass, false)
  assert.equal(mixed.results[1].label, 'gate')
  assert.equal(evaluateGoal({ predicates: [] }).met, false) // fail-closed: nothing to say no
  assert.equal(evaluateGoal({}).met, false)
})
test('evaluateGoal: a predicate with no cmd is fail-closed with a clean message (not a raw TypeError)', () => {
  const r = evaluateGoal({ predicates: [{ label: 'x' }] })
  assert.equal(r.met, false)
  assert.equal(r.results[0].error, 'predicate has no cmd')
})

// ---- Phase B: loop bounds (the other half of stop = done ∨ bound) ----
test('boundExceeded returns the tripped bound name, else null', () => {
  const b = { maxIterations: 3, capTokens: 1000, noProgressRounds: 2 }
  assert.equal(boundExceeded(b, { iterations: 1, tokens: 10, noProgress: 0 }), null)
  assert.equal(boundExceeded(b, { iterations: 3 }), 'maxIterations')
  assert.equal(boundExceeded(b, { tokens: 1000 }), 'capTokens')
  assert.equal(boundExceeded(b, { noProgress: 2 }), 'noProgressRounds')
  assert.equal(boundExceeded(null, { iterations: 99 }), null) // no bounds declared → no trip
})
// Synthetic harness for the generic loop ENGINE (Phase C). Validates the MECHANICS — advance / gate /
// escalate / bound / authority / RESUME — against a fake queue with controllable pass/fail commands
// (`true`/`false`), NOT real domain migration. The test commands are the engine's "something that can
// say no." (Real-migration fit is deferred to Stage 1; this proves the engine does what it's coded to.)
import { test, after } from 'node:test'
import assert from 'node:assert/strict'
import fs from 'node:fs'
import os from 'node:os'
import path from 'node:path'
import { runLoop, firstPendingItem } from '../scripts/run-loop.mjs'
import { readLedger, writeLeg, EVENTS_FILE } from '../scripts/run-ledger.mjs'

// Every scratch plan dir is tracked and removed once the file's tests have run, so repeated runs do not
// accumulate `run-loop-*` directories under the OS temp dir.
const scratchDirs = []
const tmp = () => {
  const dir = fs.mkdtempSync(path.join(os.tmpdir(), 'run-loop-'))
  scratchDirs.push(dir)
  return dir
}
after(() => {
  for (const dir of scratchDirs) fs.rmSync(dir, { recursive: true, force: true })
})

// Injected wiring for the synthetic queue: an item carries its own action and goal commands.
const actionFor = (it) => it.action
const goalFor = (it) => ({ predicates: [{ cmd: it.goal, label: it.id }] })
const item = (id, action, goal) => ({ id, action, goal })
// The loop's own events (`loop` / `loop-end`) from plan dir `d`; blank lines are ignored so an empty log
// parses to [] instead of crashing in JSON.parse('').
const loopEvents = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8')
  .trim()
  .split('\n')
  .filter(Boolean)
  .map((l) => JSON.parse(l))
  .filter((e) => e.type === 'loop' || e.type === 'loop-end')

// ---- authority gate (fail-closed) ----
test('the loop refuses to run without implement authority (fail-closed)', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'triage' })
  assert.equal(r.status, 'escalated')
  assert.equal(r.reason, 'authority-denied')
  assert.deepEqual(r.completed, [])
})
test('implement (or higher) authority lets the loop run', () => {
  const d = tmp()
  assert.equal(runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'merge' }).status, 'complete')
})

// ---- happy path ----
test('all items pass action + goal → complete, in order', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
  assert.equal(r.status, 'complete')
  assert.deepEqual(r.completed, ['a', 'b'])
  assert.deepEqual(readLedger(d).loop.completed, ['a', 'b'])
})

// ---- the two failure modes ----
test('a failing GOAL escalates at that item (completed preserved), with the failing predicate named', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
  assert.equal(r.status, 'escalated')
  assert.equal(r.reason, 'goal-not-met')
  assert.equal(r.item, 'b')
  assert.deepEqual(r.completed, ['a'])
  assert.deepEqual(r.failing, ['b'])
})
test('a failing ACTION escalates before the goal is even evaluated', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'false', 'true')], actionFor, goalFor, authority: 'implement' })
  assert.equal(r.status, 'escalated')
  assert.equal(r.reason, 'action-failed')
  assert.equal(r.item, 'b')
  assert.deepEqual(r.completed, ['a'])
})

// ---- bounds (total cap, resume-safe) ----
test('maxIterations caps total completed items, then escalates (not silently truncates)', () => {
  const d = tmp()
  const q = ['a', 'b', 'c', 'd', 'e'].map((id) => item(id, 'true', 'true'))
  const r = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } })
  assert.equal(r.status, 'escalated')
  assert.equal(r.reason, 'maxIterations')
  assert.deepEqual(r.completed, ['a', 'b'])
  assert.equal(readLedger(d).loop.escalated.item, 'c') // uniform escalated shape: bound stop records the item too
})

// ---- RESUME (the point of the ledger) ----
test('after an escalation, re-running resumes from the failed item (skips completed) and finishes', () => {
  const d = tmp()
  // run 1: b's goal fails → escalate at b, completed [a]
  const r1 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
  assert.equal(r1.status, 'escalated')
  assert.deepEqual(r1.completed, ['a'])
  // "fix" b (its goal now passes) and re-run on the SAME ledger → resumes at b, completes
  const r2 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
  assert.equal(r2.status, 'complete')
  assert.deepEqual(r2.completed, ['a', 'b'])
  // 'a' was NOT re-run on resume (firstPendingItem skipped it)
  assert.equal(readLedger(d).loop.completed.filter((x) => x === 'a').length, 1)
})

// ---- empty queue ----
test('an empty queue completes vacuously (nothing to do)', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [], actionFor, goalFor, authority: 'implement' })
  assert.equal(r.status, 'complete')
  assert.deepEqual(r.completed, [])
})

test('a malformed queue is REJECTED (fail-closed), never silently completed', () => {
  // duplicate ids would collapse two distinct items into one → reject
  const dup = runLoop({ planDir: tmp(), queue: [item('a', 'true', 'true'), item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
  assert.equal(dup.status, 'escalated')
  assert.equal(dup.reason, 'invalid-queue')
  // an item with no stable string id has no resumable identity → reject
  const noid = runLoop({ planDir: tmp(), queue: [{ action: 'true', goal: 'true' }], actionFor, goalFor, authority: 'implement' })
  assert.equal(noid.status, 'escalated')
  assert.equal(noid.reason, 'invalid-queue')
})

// ---- ledger + event trail ----
test('the loop records per-item events + a loop-end event to the run-ledger trail', () => {
  const d = tmp()
  runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
  const ev = loopEvents(d)
  assert.equal(ev.filter((e) => e.type === 'loop').length, 2) // one per attempted item
  assert.equal(ev.filter((e) => e.type === 'loop-end').length, 1)
  assert.equal(ev.find((e) => e.type === 'loop-end').status, 'escalated')
})

// ---- the resumable primitive in isolation ----
test('firstPendingItem returns the first not-yet-completed item, or undefined when all done', () => {
  const q = [item('a', 'true', 'true'), item('b', 'true', 'true')]
  assert.equal(firstPendingItem(q, []).id, 'a')
  assert.equal(firstPendingItem(q, ['a']).id, 'b')
  assert.equal(firstPendingItem(q, ['a', 'b']), undefined)
})

// ---- a missing goal is a CONFIG error (goal-absent), distinct from a failing goal ----
test('a missing/empty goal is goal-ABSENT (fail-closed config error), not goal-not-met', () => {
  const r = runLoop({
    planDir: tmp(),
    queue: [item('a', 'true', 'true'), { id: 'b', action: 'true' }],
    actionFor,
    goalFor: (it) => (it.goal ? { predicates: [{ cmd: it.goal, label: it.id }] } : {}), // b has no goal
    authority: 'implement',
  })
  assert.equal(r.status, 'escalated')
  assert.equal(r.reason, 'goal-absent')
  assert.equal(r.item, 'b')
  assert.deepEqual(r.failing, [''])
})

// ---- a goal-only item records actionSkipped (a typo'd real wiring leaves a signal) ----
test('a goal-only item (no action) is recorded actionSkipped and completes on its goal', () => {
  const d = tmp()
  const r = runLoop({ planDir: d, queue: [{ id: 'a', goal: 'true' }], actionFor: () => undefined, goalFor, authority: 'implement' })
  assert.equal(r.status, 'complete')
  const ev = loopEvents(d).find((e) => e.type === 'loop')
  assert.equal(ev.actionSkipped, true)
})

// ---- Phase-A × Phase-C: the loop's `loop` field is disjoint from the legs a real pipeline writes ----
test('the loop preserves existing leg state (loop and legs are disjoint ledger keys)', () => {
  const d = tmp()
  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' }) // a real pipeline left legs here
  runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
  const led = readLedger(d)
  assert.equal(led.legs.deepPlan.status, 'done-advance') // legs untouched by the loop
  assert.deepEqual(led.loop.completed, ['a']) // loop state sits alongside, not clobbering
})

// ---- resume after a BOUND escalation (not just a failure) ----
test('resume after a maxIterations escalation: raising the cap continues from where it stopped', () => {
  const d = tmp()
  const q = ['a', 'b', 'c'].map((id) => item(id, 'true', 'true'))
  const r1 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } })
  assert.equal(r1.reason, 'maxIterations')
  assert.deepEqual(r1.completed, ['a', 'b'])
  const r2 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 5 } })
  assert.equal(r2.status, 'complete')
  assert.deepEqual(r2.completed, ['a', 'b', 'c']) // resumed at c; a/b not re-run
})