From 0348b3573031f0456eed53e25223fe2326a00a49 Mon Sep 17 00:00:00 2001 From: Number531 <120485065+Number531@users.noreply.github.com> Date: Mon, 15 Jun 2026 18:28:46 -0400 Subject: [PATCH 1/3] feat(deep-plan): governable-autonomy layer on the run-ledger (event log, sidecar metadata, authority, completion condition) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds the tracking/handoff components Steinberger & Karpathy name as the boundaries to set before autonomy — all additive, optional, backward-compatible (readLedger→null seam), and inert until a consumer reads them. Built lean + test-gated (NOT via /deep-plan on itself). Phase A (observability/conformance): - run-events.jsonl: append-only, monotonic seq (replay/gap detection), WRITE-AHEAD (event before ledger write → crash over-counts, never silently under-counts). Claude-side counterpart to the Codex deep-plan-events.jsonl (deliberately lighter). - sidecar metadata: schemaName + schemaUrl + artifactKind + producer.skillVersion. - VISION.md (constitution + 7-layer architecture + the four-step extension contract) + DECISIONS.md (human decision journal). 
Phase B (governable autonomy): - authority tiers (triage --- .../references/run-ledger.schema.md | 57 +++++++++++- .claude/skills/deep-plan/DECISIONS.md | 36 ++++++++ .claude/skills/deep-plan/VISION.md | 80 ++++++++++++++++ .../skills/deep-plan/scripts/run-ledger.mjs | 80 +++++++++++++++- .../test/run-ledger-governance.test.mjs | 91 +++++++++++++++++++ 5 files changed, 341 insertions(+), 3 deletions(-) create mode 100644 .claude/skills/deep-plan/DECISIONS.md create mode 100644 .claude/skills/deep-plan/VISION.md create mode 100644 .claude/skills/deep-plan/test/run-ledger-governance.test.mjs diff --git a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md index 92a6e022e..1c80aff57 100644 --- a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md +++ b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md @@ -13,7 +13,13 @@ created before this contract run unchanged (the conductor falls back to its prio ```jsonc { + // — sidecar metadata (Codex-convention-aligned; deliberately lighter — no createdAt/updatedAt, kept deterministic) — "schemaVersion": 1, + "schemaName": "run-ledger", + "schemaUrl": ".claude/skills/deep-plan-pipeline/references/run-ledger.schema.md", + "artifactKind": "deep-plan-run-ledger", + "producer": { "skill": "deep-plan", "skillVersion": "1.0.0", "module": "run-ledger.mjs" }, + "legs": { "deepPlan": { "status": "", "verdict": "sound|minor-only|needs-critical-fixes|has-major-gaps", "degraded": false, "at": "YYYY-MM-DD" }, "review": { "status": "", "verdict": "", "counts": { /* … */ }, "at": "YYYY-MM-DD" }, @@ -21,10 +27,59 @@ created before this contract run unchanged (the conductor falls back to its prio "implementation": { "status": "", "rawStatus": "", "branch": "…", "at": "YYYY-MM-DD" } }, "loopbackCount": 0, // bumped by the revise leg on each revise→re-review loop - "lastEscalation": null // conductor-managed + "lastEscalation": null, // 
conductor-managed + + // — Phase B: governable-autonomy fields (all OPTIONAL; set via writeMeta; absent ⇒ prior behaviour) — + "authority": "triage", // what the NEXT stage MAY do: triage", + "metric": "", + "predicates": [ { "label": "parity", "kind": "command", "cmd": "node …/X-golden-parity.test.js" } ], + "met": false, // = AND(predicate exit codes); empty predicates ⇒ false (fail-closed) + "bounds": { "maxIterations": 3, "capTokens": 800000, "noProgressRounds": 2 } + } } ``` +## Governable-autonomy fields (Phase B) + +These make the ledger ready for an autonomous loop *before* the loop exists — each is optional and +inert until a consumer reads it (the conductor / a future rollout loop). + +- **`authority`** — the granted permission tier (`triage < implement < push < merge`), checked by + `authorityAllows(ledger, action)`. **Fail-closed**: absent/unknown authority permits nothing, so a + handoff can never silently escalate from triage to merge. +- **`goal.bounds`** — the ACTIVE loop boundary, read by `boundExceeded(bounds, {iterations, tokens, + noProgress})`, which returns the tripped bound name (a stop reason) or null. The boundary that prevents + runaway (the 6M-token failure had none). Bound fields are positive integers; `0`/negative trip + immediately and should be treated as misconfiguration. +- **`budget`** (reserved) — a top-level `{capTokens, spent}` field for the loop's own spend-tracking; + NOT yet wired to a helper. The **active** token cap today is `goal.bounds.capTokens`; when the loop + ships it will reconcile `budget.spent` against `capTokens` and own this field. +- **`goal`** — the quantified "done", evaluated by `evaluateGoal(goal, {cwd})`: + `met = AND(predicate.cmd exit 0)`. Every predicate is a **falsifiable command** (deterministic-first; + a metric/verdict predicate is just a command that exits 0/1). **Empty predicates ⇒ not met** + (fail-closed: a loop needs something that can say *no*). 
The loop runs + `until met ∨ boundExceeded(goal.bounds) → else escalate`. + +## Sibling: `run-events.jsonl` (append-only transition log) + +Written alongside the ledger by `appendEvent()` (called write-ahead from every `writeLeg`/`writeMeta`). +The Claude-side machine transition trail — the lighter counterpart to the Codex runtime's +`deep-plan-events.jsonl` (same `.claude`/`.agents` runtime split). One JSON object per line, never +rewritten; each carries a **monotonic 1-based `seq`** for ordering + replay/gap detection. + +```jsonc +{ "seq": 1, "type": "leg", "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" } +{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null } +``` + +Write-ahead ordering (event appended *before* the ledger write) means a crash over-counts (an event +with no committed state — auditable) rather than silently under-counting. Reader: the conductor's +resume/observability path. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude +posture; the Codex `deep-plan-events.jsonl` carries the heavier registered-schema discipline). + ## Status enum (the single point that preserves verdict semantics) A leg's `status` is **derived from its verdict** — never collapsed to a binary done/not-done — via the diff --git a/.claude/skills/deep-plan/DECISIONS.md b/.claude/skills/deep-plan/DECISIONS.md new file mode 100644 index 000000000..cbe23af5f --- /dev/null +++ b/.claude/skills/deep-plan/DECISIONS.md @@ -0,0 +1,36 @@ +# DECISIONS — deep-plan skill family + +Human-legible decision journal: **why**, not what. Append dated, high-level entries for meaningful +decisions only (not routine actions, never secrets). Distinct from `run-events.jsonl` (machine +transition log) and `CHANGELOG.md` (releases). Purpose: a person who stays out of the per-step loop +can re-enter with full understanding. 
*(Steinberger's persistent log; Karpathy: "outsource your +thinking, never your understanding.")* + +--- + +## 2026-06-15 — Run-state ledger as the integration seam (not a monolith) +**Decision:** integrate the legs through ONE shared `run-ledger.json` (state) + `signal-map.md` +(routing), with the conductor as a thin reader — rather than welding the pipeline into a single +workflow. **Why:** every source (Anthropic skills, Spec Kit, orchestrator-worker) and our own +6M-token convergence failure say composed-with-contracts beats monolith; "seamless handoff" comes +from the shared state contract, not a shared container. + +## 2026-06-15 — Build the harness lean + test-gated, NOT via /deep-plan on itself +**Decision:** harness/tooling changes are built directly, gated by `node --test`, using `/deep-plan` +at most ONCE to extract invariants. **Why:** two `/deep-plan` runs to plan the ledger cost ~6M tokens +and never converged (category error: app-change planner aimed at meta-tooling; self-referential; +additive-by-design amplifies complexity). The test suite is the verifier, not another audit loop. +Captured in memory: `deep-plan-harness-self-modification`. + +## 2026-06-15 — Status derives from verdict; never collapse to binary done/not-done +**Decision:** a leg's ledger status is *derived* from its verdict via maps mirroring `signal-map.md`, +fail-closed. **Why:** the binary-flatten defect (a `has-major-gaps` leg-1 auto-advancing to review) +was the exact resume-correctness bug both audits kept flagging; deriving status preserves the +conductor's routing semantics so a resumed run can never skip a certification gate. + +## 2026-06-15 — Governable-autonomy essentials = authority, completion condition, decision journal +**Decision:** prioritize three of the four "final" handoff components — `authority` tier, quantified +`goal`/completion condition, and this journal — as additive ledger fields + one doc. 
**Why:** +Karpathy's rule for autonomy is "set the boundaries first" (objective, metric, permissions) and keep +a record you can re-enter through. Budget boundary + the rollout loop are deferred until a consumer +(the loop) exists — don't build machinery ahead of its substrate. diff --git a/.claude/skills/deep-plan/VISION.md b/.claude/skills/deep-plan/VISION.md new file mode 100644 index 000000000..36e48aeb1 --- /dev/null +++ b/.claude/skills/deep-plan/VISION.md @@ -0,0 +1,80 @@ +# VISION — The deep-plan skill family as a governable-autonomy foundation + +**North star.** A composed system of single-purpose skills that plans, reviews, implements, and +audits changes — integrated through *shared contracts* (the run-ledger state + the signal-map +routing law), not a monolith — so work can be tracked, handed off, resumed, and (eventually) looped +**without a human in every step, yet never without a human's understanding.** + +This file is the constitution: it states intent + boundaries + the extension contract, so each +addition (and each loop tick, later) does not re-derive intent from scratch. + +## The 7-layer reference architecture (industry-convergent: Anthropic skills, Spec Kit, orchestrator-worker) + +| Layer | Role | Where it lives | Status | +|---|---|---|---| +| 1. Constitution / VISION | durable intent + boundaries | this file | ✅ | +| 2. Staged artifact pipeline | spec → plan → review → implement → audit | the deep-plan skills | ✅ | +| 3. Control-plane / worker | orchestrator routes; workers execute (tool-enforced) | `deep-plan-pipeline` conductor | ⚠️ prose-enforced | +| 4. Durable checkpoint | resume from state N, not scratch | `run-ledger.json` | ✅ | +| 5. Validator that says "no" | tests / audit refute the work | `node --test` + adversarial audit | ✅ | +| 6. Bounded routing | verdict → action, caps, no improvisation | `signal-map.md` + bounds | ✅ | +| 7. 
Decision-ready HITL escalation | opinionated brief, not a raw question | conductor escalation block | ⚠️ format | + +## Governable-autonomy components (the tracking surface) + +Every component is a **field on the run-ledger or a sibling file** — never a new monolith. + +| Component | Form | Purpose | Status | +|---|---|---|---| +| Tracking | `run-ledger.json` (`legs`, status) | resume at the right leg | ✅ shipped | +| Handoff | the ledger contract | leg→leg carries the verdict | ✅ shipped | +| Transcript | platform `subagents/workflows/*` | full-fidelity replay | ✅ platform | +| Logging | `run-events.jsonl` (append-only, `seq`-stamped, write-ahead) | machine transition trail; reader = conductor resume/observability | ✅ Phase A — Claude-side counterpart to Codex `deep-plan-events.jsonl`, deliberately lighter (unit-tested, not JSON-schema-registered) | +| Sidecar conformance | `schemaName`/`schemaUrl`/`artifactKind`/`producer` | cross-side consistency | ✅ Phase A — Codex-convention-aligned, kept deterministic (omits `createdAt`/`updatedAt`) | +| **Authority** | `run-ledger.authority` + `authorityAllows()` | what the next stage MAY do (triage= want +} + +// ── Phase B: completion condition — the quantified "done" ────────────────────────────────── +// done = (P₁ ∧ … ∧ Pₙ), each predicate a FALSIFIABLE command (exit 0 = pass). Deterministic-first; +// a "metric"/"verdict" predicate is just a command that computes it and exits 0/1. Empty predicates → +// NOT met (fail-closed: a loop needs something that can say no). `bounds` are the loop's stop conditions, +// evaluated separately via boundExceeded(). A chatty-but-passing command is not misread as a failure +// (maxBuffer raised). +// +// TRUST BOUNDARY: each predicate `cmd` runs in a shell with the conductor's privileges. Predicates MUST +// be operator- or plan-authored; NEVER derive a predicate cmd from untrusted input. When a consumer (the +// loop) wires this in, gate it behind authorityAllows(ledger, 'implement'). 
+export function evaluateGoal(goal, { cwd, timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024 } = {}) { + const results = (goal && Array.isArray(goal.predicates) ? goal.predicates : []).map((p) => { + let pass = false, error = null + try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true } + catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' } + return { label: p.label ?? (p.cmd || '').slice(0, 60), kind: p.kind ?? 'command', pass, error } + }) + const met = results.length > 0 && results.every((r) => r.pass) // empty → not met + return { met, results } +} + +// Has a loop bound tripped? Returns the bound name (a stop reason) or null — the other half of +// `stop = done ∨ bound`. Without it, no autonomy is safe. +export function boundExceeded(bounds, { iterations = 0, tokens = 0, noProgress = 0 } = {}) { + if (!bounds) return null + if (bounds.maxIterations != null && iterations >= bounds.maxIterations) return 'maxIterations' + if (bounds.capTokens != null && tokens >= bounds.capTokens) return 'capTokens' + if (bounds.noProgressRounds != null && noProgress >= bounds.noProgressRounds) return 'noProgressRounds' + return null +} + export { LEG_ORDER } diff --git a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs new file mode 100644 index 000000000..4313ffaa4 --- /dev/null +++ b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs @@ -0,0 +1,91 @@ +// Governance/observability tests for the run-ledger: sidecar metadata, the append-only event +// log (Phase A), plus authority tiers and the goal/completion-condition evaluator (Phase B). +// Deterministic gate — the validator that says "no" for the governable-autonomy layer. 
+import { test } from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs' +import os from 'node:os' +import path from 'node:path' +import { + writeLeg, writeMeta, readLedger, appendEvent, EVENTS_FILE, + authorityAllows, authorityRank, evaluateGoal, boundExceeded, +} from '../scripts/run-ledger.mjs' + +const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-ledger-gov-')) +const events = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').filter(Boolean).map((l) => JSON.parse(l)) + +// ---- Phase A: sidecar metadata conformance ---- +test('a fresh ledger carries full sidecar metadata (schemaVersion/Name/Url + artifactKind + producer.skillVersion)', () => { + const d = tmp() + const led = writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' }) + assert.equal(led.schemaVersion, 1) + assert.equal(led.schemaName, 'run-ledger') + assert.match(led.schemaUrl, /run-ledger\.schema\.md$/) + assert.equal(led.artifactKind, 'deep-plan-run-ledger') + assert.equal(led.producer.skill, 'deep-plan') + assert.equal(led.producer.skillVersion, '1.0.0') +}) + +// ---- Phase A: append-only event log ---- +test('every writeLeg/writeMeta appends a parseable event with monotonic seq; append-only (never rewritten)', () => { + const d = tmp() + writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound', at: '2026-06-15' }) + writeLeg(d, 'review', { status: 'done-advance', verdict: 'minor-only', at: '2026-06-15' }) + writeMeta(d, { loopbackCount: 1 }) + const ev = events(d) + assert.equal(ev.length, 3) // accumulates, not overwrites + assert.deepEqual(ev.map((e) => e.seq), [1, 2, 3]) // monotonic → gap/replay detectable + assert.deepEqual(ev[0], { seq: 1, type: 'leg', leg: 'deepPlan', status: 'done-advance', verdict: 'sound', at: '2026-06-15' }) + assert.equal(ev[1].leg, 'review') + assert.equal(ev[2].type, 'meta') + assert.deepEqual(ev[2].keys, ['loopbackCount']) +}) + +test('events are WRITE-AHEAD: after a 
successful write the ledger state and its event agree', () => { + const d = tmp() + writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' }) + assert.equal(readLedger(d).legs.deepPlan.status, 'done-advance') + assert.equal(events(d)[0].status, 'done-advance') // append-before-write: never silently under-counts +}) + +test('appendEvent is independently usable, order-preserving, and seq-stamped', () => { + const d = tmp() + appendEvent(d, { type: 'note', n: 1 }) + appendEvent(d, { type: 'note', n: 2 }) + assert.deepEqual(events(d).map((e) => [e.seq, e.n]), [[1, 1], [2, 2]]) +}) + +// ---- Phase B: authority tiers (fail-closed, monotonic) ---- +test('authority is monotonic and fail-closed: no/unknown authority permits nothing; grants never silently escalate', () => { + assert.equal(authorityAllows({}, 'triage'), false) // no grant → nothing (fail-closed) + assert.equal(authorityAllows({ authority: 'bogus' }, 'triage'), false) // unknown → nothing + assert.equal(authorityAllows({ authority: 'triage' }, 'triage'), true) + assert.equal(authorityAllows({ authority: 'triage' }, 'push'), false) // can't escalate above grant + assert.equal(authorityAllows({ authority: 'merge' }, 'push'), true) // higher grant covers lower + // fail-closed on the ACTION side too: an unknown/typo'd/case-mismatched action is denied, never granted + assert.equal(authorityAllows({ authority: 'merge' }, 'bogus'), false) + assert.equal(authorityAllows({ authority: 'merge' }, 'MERGE'), false) // case-sensitive + assert.equal(authorityAllows({ authority: 'merge' }, undefined), false) + assert.ok(authorityRank('merge') > authorityRank('triage')) +}) + +// ---- Phase B: completion condition (conjunction of falsifiable commands, fail-closed) ---- +test('evaluateGoal: met iff ALL predicate commands pass (exit 0); empty predicates → NOT met (a loop needs a "no")', () => { + assert.equal(evaluateGoal({ predicates: [{ cmd: 'true' }, { cmd: 'true' }] }).met, true) + const mixed = evaluateGoal({ 
predicates: [{ cmd: 'true' }, { cmd: 'false', label: 'gate' }] }) + assert.equal(mixed.met, false) // conjunction: one failing predicate fails the whole + assert.equal(mixed.results[1].pass, false) + assert.equal(mixed.results[1].label, 'gate') + assert.equal(evaluateGoal({ predicates: [] }).met, false) // fail-closed: nothing to say no + assert.equal(evaluateGoal({}).met, false) +}) + +// ---- Phase B: loop bounds (the other half of stop = done ∨ bound) ---- +test('boundExceeded returns the tripped bound name, else null', () => { + const b = { maxIterations: 3, capTokens: 1000, noProgressRounds: 2 } + assert.equal(boundExceeded(b, { iterations: 1, tokens: 10, noProgress: 0 }), null) + assert.equal(boundExceeded(b, { iterations: 3 }), 'maxIterations') + assert.equal(boundExceeded(b, { tokens: 1000 }), 'capTokens') + assert.equal(boundExceeded(b, { noProgress: 2 }), 'noProgressRounds') + assert.equal(boundExceeded(null, { iterations: 99 }), null) // no bounds declared → no trip +}) From efbdc1cdfe18776383700f2a61e9a6d5ef24a0eb Mon Sep 17 00:00:00 2001 From: Number531 <120485065+Number531@users.noreply.github.com> Date: Tue, 16 Jun 2026 00:12:45 -0400 Subject: [PATCH 2/3] =?UTF-8?q?feat(deep-plan):=20Phase=20C=20=E2=80=94=20?= =?UTF-8?q?generic=20loop=20engine=20(run-loop.mjs)=20+=20synthetic=20harn?= =?UTF-8?q?ess?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The autonomous-loop ENGINE that consumes the Phase-A/B governable-autonomy layer. Advances a queue: per item, run the item's ACTION then evaluate its completion GOAL (the validator that can say no), gating on AUTHORITY + BOUNDS, recording to the run-ledger so it's RESUMABLE, escalating on the first failure / tripped bound. Deliberately UN-WIRED — actionFor/goalFor are injected; the real per-domain migration config is deferred to Stage 1 (prove one by hand first). 
Validated by a SYNTHETIC harness (real true/false commands, deterministic): happy path, goal-not-met, action-failed, maxIterations total-cap, authority fail-closed, empty queue, and RESUME-from-ledger (skips completed, never re-runs). The harness proves the engine MECHANICS, not real-migration fit (honest scope, stated in code + test headers). Adversarial audit: SOUND. 3 MINOR hardening items remediated — malformed queues (duplicate / missing / non-string ids) are now REJECTED fail-closed as 'invalid-queue' (never silently 'completed'), and the escalated ledger record is uniform across all stop reasons. 75 *.test.mjs + upgrade-harness + revise dry-run-harness green; zero non-test callers (engine is capability-ahead-of-its-real-wiring, by design); backward-compatible. Co-Authored-By: Claude Fable 5 --- .claude/skills/deep-plan/VISION.md | 6 +- .claude/skills/deep-plan/scripts/run-loop.mjs | 84 ++++++++++++ .../skills/deep-plan/test/run-loop.test.mjs | 121 ++++++++++++++++++ 3 files changed, 210 insertions(+), 1 deletion(-) create mode 100644 .claude/skills/deep-plan/scripts/run-loop.mjs create mode 100644 .claude/skills/deep-plan/test/run-loop.test.mjs diff --git a/.claude/skills/deep-plan/VISION.md b/.claude/skills/deep-plan/VISION.md index 36e48aeb1..dd5d71c07 100644 --- a/.claude/skills/deep-plan/VISION.md +++ b/.claude/skills/deep-plan/VISION.md @@ -35,6 +35,7 @@ Every component is a **field on the run-ledger or a sibling file** — never a n | **Completion condition** | `run-ledger.goal` + `evaluateGoal()` | quantified, falsifiable "done" (AND of command predicates) | ✅ Phase B | | Decision journal | `DECISIONS.md` | human rationale (why, not what) | ✅ Phase A | | Budget boundary | `boundExceeded()` over `goal.bounds.capTokens` | token cap; stops runaway | ✅ Phase B (bound helper + active bound). The top-level `budget` field is *reserved* for the loop's spend-tracking — not yet wired. 
| +| **Loop engine** | `run-loop.mjs` + synthetic harness | advance / gate / escalate / bound / authority / **resume** over the ledger | ✅ Phase C — **UN-WIRED**: mechanics proven against mocks; real-domain config (the migration skill + parity tests) deferred to Stage 1 | ### Completion condition — quantified (Layer-6 core) `done = (P₁ ∧ … ∧ Pₙ)` where each predicate is a **falsifiable command (exit 0 = pass)**, @@ -65,11 +66,14 @@ schema change, an untested component) and you forfeit the property.** - **Phase A** — `run-events.jsonl` + sidecar metadata + this `VISION.md` + `DECISIONS.md`. *(built ✅)* - **Phase B** — `authority` tier + `goal`/`evaluateGoal` completion condition + `goal.bounds`/`boundExceeded` loop bounds. *(built ✅)* — the top-level `budget` field stays reserved (see Later). +- **Phase C** — generic loop ENGINE (`run-loop.mjs`): advance/gate/escalate/bound/authority/resume over the ledger, validated by a synthetic harness. *(built ✅ — UN-WIRED: the engine exists; the real-domain config is deferred, see Later.)* - **Later (consumer-gated — build when its reader exists):** - Tool-enforced control plane (strip `Edit` from the conductor) — Layer 3. - Decision-ready escalation brief format — Layer 7. - Budget boundary field — when a loop reads it. - - **The rollout loop** (`/loop` + `/goal` over the ledger) for Stage-3+ per-domain MCP→Python + - **Wire the loop to real domains** — point the Phase-C engine's `actionFor`/`goalFor` at the migration + skill + its real parity tests. The engine is built; the wiring + the real validators come from Stage 1 + (which proves one migration by hand first). Target: the Stage-3+ per-domain MCP→Python migrations — the repetitive substrate where automation pays off. Do NOT build before that substrate exists (the "build machinery before there is work for it" trap). 
diff --git a/.claude/skills/deep-plan/scripts/run-loop.mjs b/.claude/skills/deep-plan/scripts/run-loop.mjs new file mode 100644 index 000000000..13b54477e --- /dev/null +++ b/.claude/skills/deep-plan/scripts/run-loop.mjs @@ -0,0 +1,84 @@ +// Generic, domain-agnostic LOOP ENGINE over the run-ledger (Phase C). +// +// Advances a queue: for each item it runs the item's ACTION (the work), then evaluates the item's +// completion GOAL (the validator that can say "no"), respecting AUTHORITY and BOUNDS, recording every +// step to the run-ledger so the loop is RESUMABLE, and ESCALATING on the first failure or tripped bound. +// +// UN-WIRED by design: the per-item action command and goal predicates are INJECTED via `actionFor(item)` +// and `goalFor(item)`. The real config — pointing this at the Stage-3+ per-domain migration skill + its +// real parity tests — is a config step deferred until Stage 1 proves one migration by hand. This module +// is validated against a SYNTHETIC harness (run-loop.test.mjs), which proves the engine MECHANICS, not +// the real-migration fit. +// +// SAFETY: action/goal commands run in a shell (same trust boundary as evaluateGoal) and MUST be +// operator/plan-authored. The loop is fail-closed: it runs only when the ledger (or the explicit param) +// grants 'implement' authority. +import { execSync } from 'node:child_process' +import { readLedger, writeMeta, appendEvent, authorityAllows, evaluateGoal, boundExceeded } from './run-ledger.mjs' + +const itemId = (it) => (typeof it === 'string' ? it : it && it.id) +const msg = (e) => String((e && e.message) || e).slice(0, 160) + +// The resumable primitive: the first queue item not yet recorded complete in the ledger. +export function firstPendingItem(queue, completed) { + const done = new Set(completed) + return (queue || []).find((it) => !done.has(itemId(it))) +} + +// runLoop({ planDir, queue, actionFor, goalFor, authority?, bounds?, cwd? }) → { status, ... 
} +// status 'complete' — every queue item passed its goal +// status 'escalated' — stopped on authority-denied / invalid-queue / a tripped bound / action-failed / goal-not-met +// Resumable: re-invoking after ANY escalation skips already-completed items (read from ledger.loop); the prior list is loaded before the gates so an early stop never resets it. +export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bounds = {}, cwd } = {}) { + const granted = authority ?? (readLedger(planDir) || {}).authority + const completed = [...(((readLedger(planDir) || {}).loop || {}).completed || [])] // read BEFORE the gates so an early escalation persists prior progress, never [] + if (!authorityAllows({ authority: granted }, 'implement')) { + return finish(planDir, 'escalated', { reason: 'authority-denied', authority: granted ?? null, completed }) + } + // queue precondition (fail-closed): every item needs a stable, UNIQUE, non-empty string id (the resume + // key). A malformed queue must be REJECTED, never silently "completed" — an item with no resumable + // identity, or a duplicate id that would collapse two distinct items into one, is an error. + const ids = (queue || []).map(itemId) + if (ids.some((id) => typeof id !== 'string' || id === '')) { + return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'every queue item needs a non-empty string id', completed }) + } + if (new Set(ids).size !== ids.length) { + return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'duplicate item ids', completed }) + } + for (;;) { + const item = firstPendingItem(queue, completed) + if (item === undefined) return finish(planDir, 'complete', { completed }) + + const bound = boundExceeded(bounds, { iterations: completed.length }) + if (bound) return finish(planDir, 'escalated', { reason: bound, item: itemId(item), completed }) + + // 1. 
run the item's ACTION (the work that mutates state) + let actionOk = true, actionErr = null + const cmd = actionFor && actionFor(item) + if (cmd) { + try { execSync(cmd, { cwd, stdio: 'pipe', timeout: 120000, maxBuffer: 10 * 1024 * 1024 }) } + catch (e) { actionOk = false; actionErr = msg(e) } + } + // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed + const ev = actionOk ? evaluateGoal(goalFor && goalFor(item), { cwd }) : { met: false, results: [] } + appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, met: ev.met }) + + if (actionOk && ev.met) { + completed.push(itemId(item)) + writeMeta(planDir, { loop: { completed, escalated: null } }) + continue + } + // first failure → escalate (stop), preserving what completed (resume picks up here) + const reason = actionOk ? 'goal-not-met' : 'action-failed' + const failing = ev.results.filter((r) => !r.pass).map((r) => r.label) + writeMeta(planDir, { loop: { completed, escalated: { item: itemId(item), reason, failing } } }) + return finish(planDir, 'escalated', { reason, item: itemId(item), failing, completed, actionErr }, false) + } +} + +function finish(planDir, status, data, writeState = true) { + // uniform `escalated` shape across ALL stop reasons (authority / invalid-queue / bound / failure) + if (writeState) writeMeta(planDir, { loop: { completed: data.completed || [], escalated: status === 'complete' ? null : { reason: data.reason, item: data.item ?? null } } }) + appendEvent(planDir, { type: 'loop-end', status, reason: data.reason ?? null, completed: (data.completed || []).length }) + return { status, ...data } +} diff --git a/.claude/skills/deep-plan/test/run-loop.test.mjs b/.claude/skills/deep-plan/test/run-loop.test.mjs new file mode 100644 index 000000000..895b924e8 --- /dev/null +++ b/.claude/skills/deep-plan/test/run-loop.test.mjs @@ -0,0 +1,121 @@ +// Synthetic harness for the generic loop ENGINE (Phase C). 
Validates the MECHANICS — advance / gate / +// escalate / bound / authority / RESUME — against a fake queue with controllable pass/fail commands +// (`true`/`false`), NOT real domain migration. The test commands are the engine's "something that can +// say no." (Real-migration fit is deferred to Stage 1; this proves the engine does what it's coded to.) +import { test } from 'node:test' +import assert from 'node:assert/strict' +import fs from 'node:fs' +import os from 'node:os' +import path from 'node:path' +import { runLoop, firstPendingItem } from '../scripts/run-loop.mjs' +import { readLedger, EVENTS_FILE } from '../scripts/run-ledger.mjs' + +const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-loop-')) +const actionFor = (it) => it.action +const goalFor = (it) => ({ predicates: [{ cmd: it.goal, label: it.id }] }) +const item = (id, action, goal) => ({ id, action, goal }) +const loopEvents = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').map(JSON.parse).filter((e) => e.type === 'loop' || e.type === 'loop-end') + +// ---- authority gate (fail-closed) ---- +test('the loop refuses to run without implement authority (fail-closed)', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'triage' }) + assert.equal(r.status, 'escalated') + assert.equal(r.reason, 'authority-denied') + assert.deepEqual(r.completed, []) +}) +test('implement (or higher) authority lets the loop run', () => { + const d = tmp() + assert.equal(runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'merge' }).status, 'complete') +}) + +// ---- happy path ---- +test('all items pass action + goal → complete, in order', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' }) + assert.equal(r.status, 'complete') + assert.deepEqual(r.completed, 
['a', 'b']) + assert.deepEqual(readLedger(d).loop.completed, ['a', 'b']) +}) + +// ---- the two failure modes ---- +test('a failing GOAL escalates at that item (completed preserved), with the failing predicate named', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' }) + assert.equal(r.status, 'escalated') + assert.equal(r.reason, 'goal-not-met') + assert.equal(r.item, 'b') + assert.deepEqual(r.completed, ['a']) + assert.deepEqual(r.failing, ['b']) +}) +test('a failing ACTION escalates before the goal is even evaluated', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'false', 'true')], actionFor, goalFor, authority: 'implement' }) + assert.equal(r.status, 'escalated') + assert.equal(r.reason, 'action-failed') + assert.equal(r.item, 'b') + assert.deepEqual(r.completed, ['a']) +}) + +// ---- bounds (total cap, resume-safe) ---- +test('maxIterations caps total completed items, then escalates (not silently truncates)', () => { + const d = tmp() + const q = ['a', 'b', 'c', 'd', 'e'].map((id) => item(id, 'true', 'true')) + const r = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } }) + assert.equal(r.status, 'escalated') + assert.equal(r.reason, 'maxIterations') + assert.deepEqual(r.completed, ['a', 'b']) + assert.equal(readLedger(d).loop.escalated.item, 'c') // uniform escalated shape: bound stop records the item too +}) + +// ---- RESUME (the point of the ledger) ---- +test('after an escalation, re-running resumes from the failed item (skips completed) and finishes', () => { + const d = tmp() + // run 1: b's goal fails → escalate at b, completed [a] + const r1 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' }) + assert.equal(r1.status, 'escalated') + 
assert.deepEqual(r1.completed, ['a']) + // "fix" b (its goal now passes) and re-run on the SAME ledger → resumes at b, completes + const r2 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' }) + assert.equal(r2.status, 'complete') + assert.deepEqual(r2.completed, ['a', 'b']) + // 'a' was NOT re-run on resume (firstPendingItem skipped it) + assert.equal(readLedger(d).loop.completed.filter((x) => x === 'a').length, 1) +}) + +// ---- empty queue ---- +test('an empty queue completes vacuously (nothing to do)', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [], actionFor, goalFor, authority: 'implement' }) + assert.equal(r.status, 'complete') + assert.deepEqual(r.completed, []) +}) + +test('a malformed queue is REJECTED (fail-closed), never silently completed', () => { + // duplicate ids would collapse two distinct items into one → reject + const dup = runLoop({ planDir: tmp(), queue: [item('a', 'true', 'true'), item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' }) + assert.equal(dup.status, 'escalated') + assert.equal(dup.reason, 'invalid-queue') + // an item with no stable string id has no resumable identity → reject + const noid = runLoop({ planDir: tmp(), queue: [{ action: 'true', goal: 'true' }], actionFor, goalFor, authority: 'implement' }) + assert.equal(noid.status, 'escalated') + assert.equal(noid.reason, 'invalid-queue') +}) + +// ---- ledger + event trail ---- +test('the loop records per-item events + a loop-end event to the run-ledger trail', () => { + const d = tmp() + runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' }) + const ev = loopEvents(d) + assert.equal(ev.filter((e) => e.type === 'loop').length, 2) // one per attempted item + assert.equal(ev.filter((e) => e.type === 'loop-end').length, 1) + assert.equal(ev.find((e) => e.type === 'loop-end').status, 
'escalated') +}) + +// ---- the resumable primitive in isolation ---- +test('firstPendingItem returns the first not-yet-completed item, or undefined when all done', () => { + const q = [item('a', 'true', 'true'), item('b', 'true', 'true')] + assert.equal(firstPendingItem(q, []).id, 'a') + assert.equal(firstPendingItem(q, ['a']).id, 'b') + assert.equal(firstPendingItem(q, ['a', 'b']), undefined) +}) From 6f17480ea20adf602827f38cb3e2e53f6d734360 Mon Sep 17 00:00:00 2001 From: Number531 <120485065+Number531@users.noreply.github.com> Date: Tue, 16 Jun 2026 00:59:10 -0400 Subject: [PATCH 3/3] fix(deep-plan): remediate full-audit findings on the governable-autonomy layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Three independent adversarial audits of PR #229 (A+B+C): code SOUND, backward-compat CLEAN, docs HAD-CONTRADICTIONS (schema doc lagged Phase C). All real findings remediated; code was not overclaimed (every '✅ built' verified real). No fail-open, no regression. Code (fail-closed / robustness / observability): - run-loop: distinguish 'goal-absent' (missing goal config) from 'goal-not-met' (goal ran + failed); record actionSkipped so a typo'd real wiring leaves a trail signal, not a silent skip. - evaluateGoal: guard a predicate with no cmd → clean 'predicate has no cmd' error, not a raw TypeError. - appendEvent: torn-write guard — separate a crashed partial line so seq/parse stay sane. Docs (schema doc was written before Phase C landed): - document the `loop` ledger field + the `loop`/`loop-end` event types that run-loop.mjs writes. - correct 'before the loop exists' → the engine is built-but-un-wired; budget is read by nothing yet. - soften the events-log 'Reader: conductor' claim → observability, not yet consumed (write-only today). - DECISIONS.md: Phase-C entry recording the build-engine-but-defer-wiring decision.
Tests (+5): missing-cmd guard; goal-absent vs goal-not-met; actionSkipped; the Phase-A×Phase-C cross-path (loop preserves legs); resume after a maxIterations escalation. Deferred (LOW, noted): per-item writeMeta is O(N) full-ledger rewrites + a redundant meta event — fine for the bounded domain queue, optimize only if queues grow large. 80 *.test.mjs + upgrade-harness + revise dry-run-harness green; still zero non-test callers; backward-compatible. Co-Authored-By: Claude Fable 5 --- .../references/run-ledger.schema.md | 33 +++++++++---- .claude/skills/deep-plan/DECISIONS.md | 11 +++++ .../skills/deep-plan/scripts/run-ledger.mjs | 16 +++++-- .claude/skills/deep-plan/scripts/run-loop.mjs | 17 ++++--- .../test/run-ledger-governance.test.mjs | 5 ++ .../skills/deep-plan/test/run-loop.test.mjs | 48 ++++++++++++++++++- 6 files changed, 110 insertions(+), 20 deletions(-) diff --git a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md index 1c80aff57..2c7dde20b 100644 --- a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md +++ b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md @@ -38,14 +38,21 @@ created before this contract run unchanged (the conductor falls back to its prio "predicates": [ { "label": "parity", "kind": "command", "cmd": "node …/X-golden-parity.test.js" } ], "met": false, // = AND(predicate exit codes); empty predicates ⇒ false (fail-closed) "bounds": { "maxIterations": 3, "capTokens": 800000, "noProgressRounds": 2 } + }, + + // — Phase C: loop-engine resume state (written by run-loop.mjs; absent until the loop runs) — + "loop": { + "completed": ["item-id"], // queue items whose action + goal passed (the resume key set) + "escalated": { "item": "…", "reason": "goal-not-met|goal-absent|action-failed|invalid-queue|authority-denied|maxIterations|capTokens|noProgressRounds", "failing": [""] } // null when complete } } ``` -## Governable-autonomy 
fields (Phase B) +## Governable-autonomy fields (Phases B–C) -These make the ledger ready for an autonomous loop *before* the loop exists — each is optional and -inert until a consumer reads it (the conductor / a future rollout loop). +These make the ledger ready for the autonomous loop engine (`run-loop.mjs`, Phase C — **built but +un-wired** to real domains). Each is optional and inert until a consumer reads it (the loop engine, once +its real-domain config is wired; the conductor for resume). - **`authority`** — the granted permission tier (`triage < implement < push < merge`), checked by `authorityAllows(ledger, action)`. **Fail-closed**: absent/unknown authority permits nothing, so a @@ -55,8 +62,12 @@ inert until a consumer reads it (the conductor / a future rollout loop). runaway (the 6M-token failure had none). Bound fields are positive integers; `0`/negative trip immediately and should be treated as misconfiguration. - **`budget`** (reserved) — a top-level `{capTokens, spent}` field for the loop's own spend-tracking; - NOT yet wired to a helper. The **active** token cap today is `goal.bounds.capTokens`; when the loop - ships it will reconcile `budget.spent` against `capTokens` and own this field. + NOT yet read by any helper or by the Phase-C loop engine (which today tracks completion in + `loop.completed`, not token spend). The **active** token cap is `goal.bounds.capTokens`; the budget + field is wired when the loop's real-domain config tracks spend. +- **`loop`** — the loop engine's resume state (`{completed, escalated}`), written by `run-loop.mjs` and + read back by `firstPendingItem` to resume mid-queue. Disjoint from `legs` (a real pipeline's per-leg + state) — the two coexist in one ledger without collision. - **`goal`** — the quantified "done", evaluated by `evaluateGoal(goal, {cwd})`: `met = AND(predicate.cmd exit 0)`. Every predicate is a **falsifiable command** (deterministic-first; a metric/verdict predicate is just a command that exits 0/1). 
**Empty predicates ⇒ not met** @@ -71,13 +82,17 @@ The Claude-side machine transition trail — the lighter counterpart to the Code rewritten; each carries a **monotonic 1-based `seq`** for ordering + replay/gap detection. ```jsonc -{ "seq": 1, "type": "leg", "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" } -{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null } +{ "seq": 1, "type": "leg", "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" } +{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null } +{ "seq": 3, "type": "loop", "item": "…", "actionOk": true, "actionSkipped": false, "met": true } // Phase C: one per loop item +{ "seq": 4, "type": "loop-end", "status": "complete|escalated", "reason": null, "completed": 2 } // Phase C: loop terminal ``` Write-ahead ordering (event appended *before* the ledger write) means a crash over-counts (an event -with no committed state — auditable) rather than silently under-counting. Reader: the conductor's -resume/observability path. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude +with no committed state — auditable) rather than silently under-counting; a torn-write guard in +`appendEvent` separates a crashed partial line so `seq` stays sane. Reader (intended): **observability** +— the events log is NOT yet consumed by any code; the conductor resumes from `run-ledger.json` via +`firstPendingLeg`, not from this file. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude posture; the Codex `deep-plan-events.jsonl` carries the heavier registered-schema discipline). 
## Status enum (the single point that preserves verdict semantics) diff --git a/.claude/skills/deep-plan/DECISIONS.md b/.claude/skills/deep-plan/DECISIONS.md index cbe23af5f..b43c91ef7 100644 --- a/.claude/skills/deep-plan/DECISIONS.md +++ b/.claude/skills/deep-plan/DECISIONS.md @@ -34,3 +34,14 @@ conductor's routing semantics so a resumed run can never skip a certification ga Karpathy's rule for autonomy is "set the boundaries first" (objective, metric, permissions) and keep a record you can re-enter through. Budget boundary + the rollout loop are deferred until a consumer (the loop) exists — don't build machinery ahead of its substrate. + +## 2026-06-16 — Built the loop ENGINE (Phase C) but kept it UN-WIRED +**Decision:** build a generic, domain-agnostic loop engine (`run-loop.mjs`) now — validated against a +synthetic harness — but inject `actionFor`/`goalFor` and leave it un-wired to real domains. **Why:** the +"don't build machinery before its substrate" rule applies at the *wiring* boundary, not the *engine* +boundary. The engine's mechanics (advance / gate / escalate / bound / resume) are knowable from first +principles + the orchestrator-worker research and are fully testable against mocks — so a synthetic +harness *can* say "no" to the engine. What's genuinely unknowable without Stage 1 is the *fit* to real +migration (the real queue, the real parity tests, the escalation UX), so the **wiring** stays deferred. +A synthetic test proves "the engine does what it's coded to," not "this design fits real migration" — the +latter still wants Stage 1 to prove one migration by hand first. 
diff --git a/.claude/skills/deep-plan/scripts/run-ledger.mjs b/.claude/skills/deep-plan/scripts/run-ledger.mjs index c8f001c9e..62326814c 100644 --- a/.claude/skills/deep-plan/scripts/run-ledger.mjs +++ b/.claude/skills/deep-plan/scripts/run-ledger.mjs @@ -97,7 +97,11 @@ function eventCount(planDir) { // writeLeg/writeMeta log WRITE-AHEAD (event before the ledger write) so a crash over-counts // (auditable) rather than silently under-counts. Reader: the conductor's resume/observability path. export function appendEvent(planDir, event) { - fs.appendFileSync(eventsPath(planDir), JSON.stringify({ seq: eventCount(planDir) + 1, ...event }) + '\n') + const p = eventsPath(planDir) + // torn-write guard: if a prior crash left a final line without a trailing newline, separate it so the + // new record never concatenates onto a partial one (keeps the append-only log parseable + seq sane). + try { const b = fs.readFileSync(p); if (b.length && b[b.length - 1] !== 0x0a) fs.appendFileSync(p, '\n') } catch { /* absent — nothing to repair */ } + fs.appendFileSync(p, JSON.stringify({ seq: eventCount(planDir) + 1, ...event }) + '\n') } // read-merge-write a single leg entry; never clobbers another leg's key. @@ -193,9 +197,13 @@ export function authorityAllows(ledger, action) { export function evaluateGoal(goal, { cwd, timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024 } = {}) { const results = (goal && Array.isArray(goal.predicates) ? goal.predicates : []).map((p) => { let pass = false, error = null - try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true } - catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' } - return { label: p.label ?? (p.cmd || '').slice(0, 60), kind: p.kind ?? 
'command', pass, error } + if (!p || typeof p.cmd !== 'string' || !p.cmd) { + error = 'predicate has no cmd' // guard: a clean message, not a raw Node TypeError + } else { + try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true } + catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' } + } + return { label: (p && p.label) ?? ((p && p.cmd) || '').slice(0, 60) ?? '', kind: (p && p.kind) ?? 'command', pass, error } }) const met = results.length > 0 && results.every((r) => r.pass) // empty → not met return { met, results } diff --git a/.claude/skills/deep-plan/scripts/run-loop.mjs b/.claude/skills/deep-plan/scripts/run-loop.mjs index 13b54477e..ad16ee145 100644 --- a/.claude/skills/deep-plan/scripts/run-loop.mjs +++ b/.claude/skills/deep-plan/scripts/run-loop.mjs @@ -52,16 +52,21 @@ export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bo const bound = boundExceeded(bounds, { iterations: completed.length }) if (bound) return finish(planDir, 'escalated', { reason: bound, item: itemId(item), completed }) - // 1. run the item's ACTION (the work that mutates state) + // 1. run the item's ACTION (the work that mutates state). A falsy actionFor is a goal-only item, + // recorded as actionSkipped so a typo'd real wiring leaves a signal in the trail (not silent). let actionOk = true, actionErr = null const cmd = actionFor && actionFor(item) + const actionSkipped = !cmd if (cmd) { try { execSync(cmd, { cwd, stdio: 'pipe', timeout: 120000, maxBuffer: 10 * 1024 * 1024 }) } catch (e) { actionOk = false; actionErr = msg(e) } } - // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed - const ev = actionOk ? evaluateGoal(goalFor && goalFor(item), { cwd }) : { met: false, results: [] } - appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, met: ev.met }) + // 2. 
evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed. + // A missing/empty goal is 'goal-absent' (fail-closed config error), distinct from 'goal-not-met'. + const goal = goalFor && goalFor(item) + const goalAbsent = !goal || !Array.isArray(goal.predicates) || goal.predicates.length === 0 + const ev = actionOk ? evaluateGoal(goal, { cwd }) : { met: false, results: [] } + appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, actionSkipped, met: ev.met }) if (actionOk && ev.met) { completed.push(itemId(item)) @@ -69,8 +74,8 @@ export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bo continue } // first failure → escalate (stop), preserving what completed (resume picks up here) - const reason = actionOk ? 'goal-not-met' : 'action-failed' - const failing = ev.results.filter((r) => !r.pass).map((r) => r.label) + const reason = !actionOk ? 'action-failed' : (goalAbsent ? 'goal-absent' : 'goal-not-met') + const failing = goalAbsent ? 
[''] : ev.results.filter((r) => !r.pass).map((r) => r.label) writeMeta(planDir, { loop: { completed, escalated: { item: itemId(item), reason, failing } } }) return finish(planDir, 'escalated', { reason, item: itemId(item), failing, completed, actionErr }, false) } diff --git a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs index 4313ffaa4..144134dae 100644 --- a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs +++ b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs @@ -79,6 +79,11 @@ test('evaluateGoal: met iff ALL predicate commands pass (exit 0); empty predicat assert.equal(evaluateGoal({ predicates: [] }).met, false) // fail-closed: nothing to say no assert.equal(evaluateGoal({}).met, false) }) +test('evaluateGoal: a predicate with no cmd is fail-closed with a clean message (not a raw TypeError)', () => { + const r = evaluateGoal({ predicates: [{ label: 'x' }] }) + assert.equal(r.met, false) + assert.equal(r.results[0].error, 'predicate has no cmd') +}) // ---- Phase B: loop bounds (the other half of stop = done ∨ bound) ---- test('boundExceeded returns the tripped bound name, else null', () => { diff --git a/.claude/skills/deep-plan/test/run-loop.test.mjs b/.claude/skills/deep-plan/test/run-loop.test.mjs index 895b924e8..f99ab0b8d 100644 --- a/.claude/skills/deep-plan/test/run-loop.test.mjs +++ b/.claude/skills/deep-plan/test/run-loop.test.mjs @@ -8,7 +8,7 @@ import fs from 'node:fs' import os from 'node:os' import path from 'node:path' import { runLoop, firstPendingItem } from '../scripts/run-loop.mjs' -import { readLedger, EVENTS_FILE } from '../scripts/run-ledger.mjs' +import { readLedger, writeLeg, EVENTS_FILE } from '../scripts/run-ledger.mjs' const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-loop-')) const actionFor = (it) => it.action @@ -119,3 +119,49 @@ test('firstPendingItem returns the first not-yet-completed item, or undefined wh 
assert.equal(firstPendingItem(q, ['a']).id, 'b') assert.equal(firstPendingItem(q, ['a', 'b']), undefined) }) + +// ---- a missing goal is a CONFIG error (goal-absent), distinct from a failing goal ---- +test('a missing/empty goal is goal-ABSENT (fail-closed config error), not goal-not-met', () => { + const r = runLoop({ + planDir: tmp(), + queue: [item('a', 'true', 'true'), { id: 'b', action: 'true' }], + actionFor, + goalFor: (it) => (it.goal ? { predicates: [{ cmd: it.goal, label: it.id }] } : {}), // b has no goal + authority: 'implement', + }) + assert.equal(r.status, 'escalated') + assert.equal(r.reason, 'goal-absent') + assert.equal(r.item, 'b') + assert.deepEqual(r.failing, ['']) +}) + +// ---- a goal-only item records actionSkipped (a typo'd real wiring leaves a signal) ---- +test('a goal-only item (no action) is recorded actionSkipped and completes on its goal', () => { + const d = tmp() + const r = runLoop({ planDir: d, queue: [{ id: 'a', goal: 'true' }], actionFor: () => undefined, goalFor, authority: 'implement' }) + assert.equal(r.status, 'complete') + const ev = fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').map(JSON.parse).find((e) => e.type === 'loop') + assert.equal(ev.actionSkipped, true) +}) + +// ---- Phase-A × Phase-C: the loop's `loop` field is disjoint from the legs a real pipeline writes ---- +test('the loop preserves existing leg state (loop and legs are disjoint ledger keys)', () => { + const d = tmp() + writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' }) // a real pipeline left legs here + runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' }) + const led = readLedger(d) + assert.equal(led.legs.deepPlan.status, 'done-advance') // legs untouched by the loop + assert.deepEqual(led.loop.completed, ['a']) // loop state sits alongside, not clobbering +}) + +// ---- resume after a BOUND escalation (not just a failure) ---- +test('resume after a 
maxIterations escalation: raising the cap continues from where it stopped', () => { + const d = tmp() + const q = ['a', 'b', 'c'].map((id) => item(id, 'true', 'true')) + const r1 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } }) + assert.equal(r1.reason, 'maxIterations') + assert.deepEqual(r1.completed, ['a', 'b']) + const r2 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 5 } }) + assert.equal(r2.status, 'complete') + assert.deepEqual(r2.completed, ['a', 'b', 'c']) // resumed at c; a/b not re-run +})