From 0348b3573031f0456eed53e25223fe2326a00a49 Mon Sep 17 00:00:00 2001
From: Number531 <120485065+Number531@users.noreply.github.com>
Date: Mon, 15 Jun 2026 18:28:46 -0400
Subject: [PATCH 1/3] feat(deep-plan): governable-autonomy layer on the
 run-ledger (event log, sidecar metadata, authority, completion condition)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Adds the tracking/handoff components Steinberger & Karpathy name as the boundaries to set
before autonomy — all additive, optional, backward-compatible (readLedger→null seam), and
inert until a consumer reads them. Built lean + test-gated (NOT via /deep-plan on itself).

Phase A (observability/conformance):
- run-events.jsonl: append-only, monotonic seq (replay/gap detection), WRITE-AHEAD (event
  before ledger write → crash over-counts, never silently under-counts). Claude-side
  counterpart to the Codex deep-plan-events.jsonl (deliberately lighter).
- sidecar metadata: schemaName + schemaUrl + artifactKind + producer.skillVersion.
- VISION.md (constitution + 7-layer architecture + the four-step extension contract) +
  DECISIONS.md (human decision journal).

Phase B (governable autonomy):
- authority tiers (triage<implement<push<merge): authorityAllows(), fail-closed on BOTH
  the authority and the action side (unknown/typo'd action denied).
- completion condition: evaluateGoal() = AND of falsifiable command predicates (exit 0);
  empty predicates ⇒ not met (a loop needs something that can say no); maxBuffer + a TRUST
  BOUNDARY note on the shell surface.
- loop bounds: boundExceeded() over goal.bounds (the 6M-token failure had none).

Each phase passed an independent adversarial audit; gaps remediated (incl. a CRITICAL
authority fail-open on the action side) and re-verified. 64 *.test.mjs + upgrade-harness +
revise dry-run-harness + signal-map-drift(38) all green; zero new callers in the existing
pipeline; backward-compat proven.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../references/run-ledger.schema.md           | 57 +++++++++++-
 .claude/skills/deep-plan/DECISIONS.md         | 36 ++++++++
 .claude/skills/deep-plan/VISION.md            | 80 ++++++++++++++++
 .../skills/deep-plan/scripts/run-ledger.mjs   | 80 +++++++++++++++-
 .../test/run-ledger-governance.test.mjs       | 91 +++++++++++++++++++
 5 files changed, 341 insertions(+), 3 deletions(-)
 create mode 100644 .claude/skills/deep-plan/DECISIONS.md
 create mode 100644 .claude/skills/deep-plan/VISION.md
 create mode 100644 .claude/skills/deep-plan/test/run-ledger-governance.test.mjs
diff --git a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
index 92a6e022e..1c80aff57 100644
--- a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
+++ b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
@@ -13,7 +13,13 @@ created before this contract run unchanged (the conductor falls back to its prio
 
 ```jsonc
 {
+  // — sidecar metadata (Codex-convention-aligned; deliberately lighter — no createdAt/updatedAt, kept deterministic) —
   "schemaVersion": 1,
+  "schemaName": "run-ledger",
+  "schemaUrl": ".claude/skills/deep-plan-pipeline/references/run-ledger.schema.md",
+  "artifactKind": "deep-plan-run-ledger",
+  "producer": { "skill": "deep-plan", "skillVersion": "1.0.0", "module": "run-ledger.mjs" },
+
   "legs": {
     "deepPlan":       { "status": "<status>", "verdict": "sound|minor-only|needs-critical-fixes|has-major-gaps", "degraded": false, "at": "YYYY-MM-DD" },
     "review":         { "status": "<status>", "verdict": "<review verdict>", "counts": { /* … */ }, "at": "YYYY-MM-DD" },
@@ -21,10 +27,59 @@ created before this contract run unchanged (the conductor falls back to its prio
     "implementation": { "status": "<status>", "rawStatus": "<leg-3 return status>", "branch": "…", "at": "YYYY-MM-DD" }
   },
   "loopbackCount": 0,        // bumped by the revise leg on each revise→re-review loop
-  "lastEscalation": null     // conductor-managed
+  "lastEscalation": null,    // conductor-managed
+
+  // — Phase B: governable-autonomy fields (all OPTIONAL; set via writeMeta; absent ⇒ prior behaviour) —
+  "authority": "triage",     // what the NEXT stage MAY do: triage<implement<push<merge (fail-closed: absent/unknown ⇒ nothing)
+  "budget": { "capTokens": 800000, "spent": 0 },   // (reserved) loop spend-tracking — NOT yet wired; the active cap is goal.bounds.capTokens
+  "goal": {                  // the quantified completion condition
+    "objective": "<human-readable>",
+    "metric": "<the measured success criterion>",
+    "predicates": [ { "label": "parity", "kind": "command", "cmd": "node …/X-golden-parity.test.js" } ],
+    "met": false,            // = AND(predicate exit codes); empty predicates ⇒ false (fail-closed)
+    "bounds": { "maxIterations": 3, "capTokens": 800000, "noProgressRounds": 2 }
+  }
 }
 ```
 
+## Governable-autonomy fields (Phase B)
+
+These make the ledger ready for an autonomous loop *before* the loop exists — each is optional and
+inert until a consumer reads it (the conductor / a future rollout loop).
+
+- **`authority`** — the granted permission tier (`triage < implement < push < merge`), checked by
+  `authorityAllows(ledger, action)`. **Fail-closed**: absent/unknown authority permits nothing, so a
+  handoff can never silently escalate from triage to merge.
+- **`goal.bounds`** — the ACTIVE loop boundary, read by `boundExceeded(bounds, {iterations, tokens,
+  noProgress})`, which returns the tripped bound name (a stop reason) or null. The boundary that prevents
+  runaway (the 6M-token failure had none). Bound fields are positive integers; `0`/negative trip
+  immediately and should be treated as misconfiguration.
+- **`budget`** (reserved) — a top-level `{capTokens, spent}` field for the loop's own spend-tracking;
+  NOT yet wired to a helper. The **active** token cap today is `goal.bounds.capTokens`; when the loop
+  ships it will reconcile `budget.spent` against `capTokens` and own this field.
+- **`goal`** — the quantified "done", evaluated by `evaluateGoal(goal, {cwd})`:
+  `met = AND(predicate.cmd exit 0)`. Every predicate is a **falsifiable command** (deterministic-first;
+  a metric/verdict predicate is just a command that exits 0/1). **Empty predicates ⇒ not met**
+  (fail-closed: a loop needs something that can say *no*). The loop runs
+  `until met ∨ boundExceeded(goal.bounds) → else escalate`.
+
+## Sibling: `run-events.jsonl` (append-only transition log)
+
+Written alongside the ledger by `appendEvent()` (called write-ahead from every `writeLeg`/`writeMeta`).
+The Claude-side machine transition trail — the lighter counterpart to the Codex runtime's
+`deep-plan-events.jsonl` (same `.claude`/`.agents` runtime split). One JSON object per line, never
+rewritten; each carries a **monotonic 1-based `seq`** for ordering + replay/gap detection.
+
+```jsonc
+{ "seq": 1, "type": "leg",  "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" }
+{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null }
+```
+
+Write-ahead ordering (event appended *before* the ledger write) means a crash over-counts (an event
+with no committed state — auditable) rather than silently under-counting. Reader: the conductor's
+resume/observability path. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude
+posture; the Codex `deep-plan-events.jsonl` carries the heavier registered-schema discipline).
+
 ## Status enum (the single point that preserves verdict semantics)
 
 A leg's `status` is **derived from its verdict** — never collapsed to a binary done/not-done — via the
diff --git a/.claude/skills/deep-plan/DECISIONS.md b/.claude/skills/deep-plan/DECISIONS.md
new file mode 100644
index 000000000..cbe23af5f
--- /dev/null
+++ b/.claude/skills/deep-plan/DECISIONS.md
@@ -0,0 +1,36 @@
+# DECISIONS — deep-plan skill family
+
+Human-legible decision journal: **why**, not what. Append dated, high-level entries for meaningful
+decisions only (not routine actions, never secrets). Distinct from `run-events.jsonl` (machine
+transition log) and `CHANGELOG.md` (releases). Purpose: a person who stays out of the per-step loop
+can re-enter with full understanding. *(Steinberger's persistent log; Karpathy: "outsource your
+thinking, never your understanding.")*
+
+---
+
+## 2026-06-15 — Run-state ledger as the integration seam (not a monolith)
+**Decision:** integrate the legs through ONE shared `run-ledger.json` (state) + `signal-map.md`
+(routing), with the conductor as a thin reader — rather than welding the pipeline into a single
+workflow. **Why:** every source (Anthropic skills, Spec Kit, orchestrator-worker) and our own
+6M-token convergence failure say composed-with-contracts beats monolith; "seamless handoff" comes
+from the shared state contract, not a shared container.
+
+## 2026-06-15 — Build the harness lean + test-gated, NOT via /deep-plan on itself
+**Decision:** harness/tooling changes are built directly, gated by `node --test`, using `/deep-plan`
+at most ONCE to extract invariants. **Why:** two `/deep-plan` runs to plan the ledger cost ~6M tokens
+and never converged (category error: app-change planner aimed at meta-tooling; self-referential;
+additive-by-design amplifies complexity). The test suite is the verifier, not another audit loop.
+Captured in memory: `deep-plan-harness-self-modification`.
+
+## 2026-06-15 — Status derives from verdict; never collapse to binary done/not-done
+**Decision:** a leg's ledger status is *derived* from its verdict via maps mirroring `signal-map.md`,
+fail-closed. **Why:** the binary-flatten defect (a `has-major-gaps` leg-1 auto-advancing to review)
+was the exact resume-correctness bug both audits kept flagging; deriving status preserves the
+conductor's routing semantics so a resumed run can never skip a certification gate.
+
+## 2026-06-15 — Governable-autonomy essentials = authority, completion condition, decision journal
+**Decision:** prioritize three of the four "final" handoff components — `authority` tier, quantified
+`goal`/completion condition, and this journal — as additive ledger fields + one doc. **Why:**
+Karpathy's rule for autonomy is "set the boundaries first" (objective, metric, permissions) and keep
+a record you can re-enter through. The top-level `budget` spend-tracking field + the rollout loop are
+deferred until a consumer (the loop) exists — don't build machinery ahead of its substrate.
diff --git a/.claude/skills/deep-plan/VISION.md b/.claude/skills/deep-plan/VISION.md
new file mode 100644
index 000000000..36e48aeb1
--- /dev/null
+++ b/.claude/skills/deep-plan/VISION.md
@@ -0,0 +1,80 @@
+# VISION — The deep-plan skill family as a governable-autonomy foundation
+
+**North star.** A composed system of single-purpose skills that plans, reviews, implements, and
+audits changes — integrated through *shared contracts* (the run-ledger state + the signal-map
+routing law), not a monolith — so work can be tracked, handed off, resumed, and (eventually) looped
+**without a human in every step, yet never without a human's understanding.**
+
+This file is the constitution: it states intent + boundaries + the extension contract, so each
+addition (and each loop tick, later) does not re-derive intent from scratch.
+
+## The 7-layer reference architecture (industry-convergent: Anthropic skills, Spec Kit, orchestrator-worker)
+
+| Layer | Role | Where it lives | Status |
+|---|---|---|---|
+| 1. Constitution / VISION | durable intent + boundaries | this file | ✅ |
+| 2. Staged artifact pipeline | spec → plan → review → implement → audit | the deep-plan skills | ✅ |
+| 3. Control-plane / worker | orchestrator routes; workers execute (tool-enforced) | `deep-plan-pipeline` conductor | ⚠️ prose-enforced |
+| 4. Durable checkpoint | resume from state N, not scratch | `run-ledger.json` | ✅ |
+| 5. Validator that says "no" | tests / audit refute the work | `node --test` + adversarial audit | ✅ |
+| 6. Bounded routing | verdict → action, caps, no improvisation | `signal-map.md` + bounds | ✅ |
+| 7. Decision-ready HITL escalation | opinionated brief, not a raw question | conductor escalation block | ⚠️ format |
+
+## Governable-autonomy components (the tracking surface)
+
+Every component is a **field on the run-ledger or a sibling file** — never a new monolith.
+
+| Component | Form | Purpose | Status |
+|---|---|---|---|
+| Tracking | `run-ledger.json` (`legs`, status) | resume at the right leg | ✅ shipped |
+| Handoff | the ledger contract | leg→leg carries the verdict | ✅ shipped |
+| Transcript | platform `subagents/workflows/*` | full-fidelity replay | ✅ platform |
+| Logging | `run-events.jsonl` (append-only, `seq`-stamped, write-ahead) | machine transition trail; reader = conductor resume/observability | ✅ Phase A — Claude-side counterpart to Codex `deep-plan-events.jsonl`, deliberately lighter (unit-tested, not JSON-schema-registered) |
+| Sidecar conformance | `schemaName`/`schemaUrl`/`artifactKind`/`producer` | cross-side consistency | ✅ Phase A — Codex-convention-aligned, kept deterministic (omits `createdAt`/`updatedAt`) |
+| **Authority** | `run-ledger.authority` + `authorityAllows()` | what the next stage MAY do (triage<implement<push<merge, fail-closed) | ✅ Phase B |
+| **Completion condition** | `run-ledger.goal` + `evaluateGoal()` | quantified, falsifiable "done" (AND of command predicates) | ✅ Phase B |
+| Decision journal | `DECISIONS.md` | human rationale (why, not what) | ✅ Phase A |
+| Budget boundary | `boundExceeded()` over `goal.bounds.capTokens` | token cap; stops runaway | ✅ Phase B (bound helper + active bound). The top-level `budget` field is *reserved* for the loop's spend-tracking — not yet wired. |
+
+### Completion condition — quantified (Layer-6 core)
+`done = (P₁ ∧ … ∧ Pₙ)` where each predicate is a **falsifiable command (exit 0 = pass)**,
+deterministic-first, threshold-and-judge only where irreducible, measured against a captured
+baseline. The loop runs `until done ∨ bound (iterations / budget / no-progress) → else escalate`.
+A predicate that cannot fail is "the agent agreeing with itself." (Internalized the hard way as
+audit findings F9/F10: *prove by command, never by prose.*)
+
+## The extension contract (what makes this an evolutionary foundation)
+
+Adding any component is **always** the same four-step, additive move — never a breaking change:
+
+```
+1. add an OPTIONAL field/leg to run-ledger (readers tolerate its absence — readLedger→null seam)
+2. wire its WRITER into one skill's persist script
+3. ship its VALIDATOR (node --test) + update the drift guard
+4. land as its OWN small PR
+```
+
+Properties this guarantees: backward-compatible (inert on pre-contract folders), fail-closed
+(unknown verdict/status → escalate, never silent mis-route), validated-by-construction (no
+component ships without its own test), and incremental (Tier-0 useful, every tier additive).
+
+**Hold this line and the foundation evolves seamlessly; break it (a required field, a breaking
+schema change, an untested component) and you forfeit the property.**
+
+## Backlog (build order)
+
+- **Phase A** — `run-events.jsonl` + sidecar metadata + this `VISION.md` + `DECISIONS.md`. *(built ✅)*
+- **Phase B** — `authority` tier + `goal`/`evaluateGoal` completion condition + `goal.bounds`/`boundExceeded` loop bounds. *(built ✅)* — the top-level `budget` field stays reserved (see Later).
+- **Later (consumer-gated — build when its reader exists):**
+  - Tool-enforced control plane (strip `Edit` from the conductor) — Layer 3.
+  - Decision-ready escalation brief format — Layer 7.
+  - Budget boundary field — when a loop reads it.
+  - **The rollout loop** (`/loop` + `/goal` over the ledger) for Stage-3+ per-domain MCP→Python
+    migrations — the repetitive substrate where automation pays off. Do NOT build before that
+    substrate exists (the "build machinery before there is work for it" trap).
+
+## Non-negotiables (the boundaries before autonomy)
+- A loop needs **something that can say no** (a test / type-check / real error). Never loop without one.
+- **Constitution first**: intent is durable here, not re-derived per tick.
+- Don't build a consumer-less component. Don't merge components into a monolith. Don't deep-plan the
+  deep-plan harness (lean + test-gated only).
diff --git a/.claude/skills/deep-plan/scripts/run-ledger.mjs b/.claude/skills/deep-plan/scripts/run-ledger.mjs
index ec76f4aae..c8f001c9e 100644
--- a/.claude/skills/deep-plan/scripts/run-ledger.mjs
+++ b/.claude/skills/deep-plan/scripts/run-ledger.mjs
@@ -13,9 +13,12 @@
 // derived from it via the maps below, fail-closed (unknown verdict/status → escalate/halt).
 import fs from 'node:fs'
 import path from 'node:path'
+import { execSync } from 'node:child_process'
 
 export const LEDGER_FILE = 'run-ledger.json'
+export const EVENTS_FILE = 'run-events.jsonl' // append-only transition log (sibling of the ledger)
 export const SCHEMA_VERSION = 1
+export const SCHEMA_NAME = 'run-ledger'
 
 // Leg 1 (/deep-plan) audit.verdict → ledger status. signal-map.md §Leg 1.
 // (degraded===true overrides to 'rerun' — the conductor re-runs leg 1 once, then escalates.)
@@ -71,7 +74,30 @@ export function readLedger(planDir) {
 }
 
 function emptyLedger() {
-  return { schemaVersion: SCHEMA_VERSION, legs: {}, loopbackCount: 0, lastEscalation: null }
+  return {
+    schemaVersion: SCHEMA_VERSION,
+    schemaName: SCHEMA_NAME,
+    schemaUrl: '.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md',
+    artifactKind: 'deep-plan-run-ledger',
+    producer: { skill: 'deep-plan', skillVersion: '1.0.0', module: 'run-ledger.mjs' },
+    legs: {}, loopbackCount: 0, lastEscalation: null,
+  }
+}
+
+function eventsPath(planDir) { return path.join(planDir, EVENTS_FILE) }
+function eventCount(planDir) {
+  try { return fs.readFileSync(eventsPath(planDir), 'utf8').split('\n').filter(Boolean).length }
+  catch { return 0 }
+}
+
+// Append-only event log (one JSON object per line) — the MACHINE transition trail, the Claude-side
+// counterpart to the Codex runtime's deep-plan-events.jsonl (deliberately lighter; same .claude/.agents
+// runtime split). Distinct from run-ledger.json (current state) and DECISIONS.md (human rationale).
+// Each event carries a monotonic 1-based `seq` for ordering + replay/gap detection; never rewritten.
+// writeLeg/writeMeta log WRITE-AHEAD (event before the ledger write) so a crash over-counts
+// (auditable) rather than silently under-counts. Reader: the conductor's resume/observability path.
+export function appendEvent(planDir, event) {
+  fs.appendFileSync(eventsPath(planDir), JSON.stringify({ seq: eventCount(planDir) + 1, ...event }) + '\n')
 }
 
 // read-merge-write a single leg entry; never clobbers another leg's key.
@@ -79,14 +105,16 @@ export function writeLeg(planDir, legKey, entry) {
   const led = readLedger(planDir) || emptyLedger()
   if (!led.legs) led.legs = {}
   led.legs[legKey] = { ...(led.legs[legKey] || {}), ...entry }
+  appendEvent(planDir, { type: 'leg', leg: legKey, status: entry.status ?? null, verdict: entry.verdict ?? null, at: entry.at ?? null }) // write-ahead
   fs.writeFileSync(ledgerPath(planDir), JSON.stringify(led, null, 2) + '\n')
   return led
 }
 
-// merge top-level conductor meta (loopbackCount, lastEscalation).
+// merge top-level conductor meta (loopbackCount, lastEscalation, authority, budget, goal).
 export function writeMeta(planDir, patch) {
   const led = readLedger(planDir) || emptyLedger()
   Object.assign(led, patch)
+  appendEvent(planDir, { type: 'meta', keys: Object.keys(patch), at: patch.at ?? null }) // write-ahead
   fs.writeFileSync(ledgerPath(planDir), JSON.stringify(led, null, 2) + '\n')
   return led
 }
@@ -135,4 +163,52 @@ export function isResumable(ledger, legKey) {
   return e.status === 'done-advance'
 }
 
+// ── Phase B: authority tiers (what the NEXT stage MAY do) ──────────────────────────────────
+// Monotonic permission ladder (ascending). The ledger's `authority` field records the granted
+// level; a handoff can never silently escalate (triage → merge requires a deliberate grant).
+export const AUTHORITY_LEVELS = ['triage', 'implement', 'push', 'merge']
+
+export function authorityRank(level) {
+  const i = AUTHORITY_LEVELS.indexOf(level)
+  return i < 0 ? -1 : i // unknown/absent → -1 (below triage): fail-closed
+}
+// Does the ledger's granted authority permit `action`? Fail-closed on BOTH sides: an absent/unknown
+// authority permits nothing, AND an unknown/typo'd/case-mismatched action (not in AUTHORITY_LEVELS) is denied.
+export function authorityAllows(ledger, action) {
+  const want = authorityRank(action)
+  if (want < 0) return false // unknown action ⇒ deny — never authorize something we can't rank
+  return authorityRank(ledger && ledger.authority) >= want
+}
+
+// ── Phase B: completion condition — the quantified "done" ──────────────────────────────────
+// done = (P₁ ∧ … ∧ Pₙ), each predicate a FALSIFIABLE command (exit 0 = pass). Deterministic-first;
+// a "metric"/"verdict" predicate is just a command that computes it and exits 0/1. Empty predicates →
+// NOT met (fail-closed: a loop needs something that can say no). `bounds` are the loop's stop conditions,
+// evaluated separately via boundExceeded(). A chatty-but-passing command is not misread as a failure
+// (maxBuffer raised).
+//
+// TRUST BOUNDARY: each predicate `cmd` runs in a shell with the conductor's privileges. Predicates MUST
+// be operator- or plan-authored; NEVER derive a predicate cmd from untrusted input. When a consumer (the
+// loop) wires this in, gate it behind authorityAllows(ledger, 'implement').
+export function evaluateGoal(goal, { cwd, timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024 } = {}) {
+  const results = (goal && Array.isArray(goal.predicates) ? goal.predicates : []).map((p) => {
+    let pass = false, error = null
+    try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true }
+    catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' }
+    return { label: p.label ?? (p.cmd || '').slice(0, 60), kind: p.kind ?? 'command', pass, error }
+  })
+  const met = results.length > 0 && results.every((r) => r.pass) // empty → not met
+  return { met, results }
+}
+
+// Has a loop bound tripped? Returns the bound name (a stop reason) or null — the other half of
+// `stop = done ∨ bound`. Without it, no autonomy is safe.
+export function boundExceeded(bounds, { iterations = 0, tokens = 0, noProgress = 0 } = {}) {
+  if (!bounds) return null
+  if (bounds.maxIterations != null && iterations >= bounds.maxIterations) return 'maxIterations'
+  if (bounds.capTokens != null && tokens >= bounds.capTokens) return 'capTokens'
+  if (bounds.noProgressRounds != null && noProgress >= bounds.noProgressRounds) return 'noProgressRounds'
+  return null
+}
+
 export { LEG_ORDER }
diff --git a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs
new file mode 100644
index 000000000..4313ffaa4
--- /dev/null
+++ b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs
@@ -0,0 +1,91 @@
+// Governance/observability tests for the run-ledger: sidecar metadata, the append-only event
+// log (Phase A), plus authority tiers and the goal/completion-condition evaluator (Phase B).
+// Deterministic gate — the validator that says "no" for the governable-autonomy layer.
+import { test } from 'node:test'
+import assert from 'node:assert/strict'
+import fs from 'node:fs'
+import os from 'node:os'
+import path from 'node:path'
+import {
+  writeLeg, writeMeta, readLedger, appendEvent, EVENTS_FILE,
+  authorityAllows, authorityRank, evaluateGoal, boundExceeded,
+} from '../scripts/run-ledger.mjs'
+
+const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-ledger-gov-'))
+const events = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').filter(Boolean).map((l) => JSON.parse(l))
+
+// ---- Phase A: sidecar metadata conformance ----
+test('a fresh ledger carries full sidecar metadata (schemaVersion/Name/Url + artifactKind + producer.skillVersion)', () => {
+  const d = tmp()
+  const led = writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' })
+  assert.equal(led.schemaVersion, 1)
+  assert.equal(led.schemaName, 'run-ledger')
+  assert.match(led.schemaUrl, /run-ledger\.schema\.md$/)
+  assert.equal(led.artifactKind, 'deep-plan-run-ledger')
+  assert.equal(led.producer.skill, 'deep-plan')
+  assert.equal(led.producer.skillVersion, '1.0.0')
+})
+
+// ---- Phase A: append-only event log ----
+test('every writeLeg/writeMeta appends a parseable event with monotonic seq; append-only (never rewritten)', () => {
+  const d = tmp()
+  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound', at: '2026-06-15' })
+  writeLeg(d, 'review', { status: 'done-advance', verdict: 'minor-only', at: '2026-06-15' })
+  writeMeta(d, { loopbackCount: 1 })
+  const ev = events(d)
+  assert.equal(ev.length, 3)                          // accumulates, not overwrites
+  assert.deepEqual(ev.map((e) => e.seq), [1, 2, 3])   // monotonic → gap/replay detectable
+  assert.deepEqual(ev[0], { seq: 1, type: 'leg', leg: 'deepPlan', status: 'done-advance', verdict: 'sound', at: '2026-06-15' })
+  assert.equal(ev[1].leg, 'review')
+  assert.equal(ev[2].type, 'meta')
+  assert.deepEqual(ev[2].keys, ['loopbackCount'])
+})
+
+test('events are WRITE-AHEAD: after a successful write the ledger state and its event agree', () => {
+  const d = tmp()
+  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' })
+  assert.equal(readLedger(d).legs.deepPlan.status, 'done-advance')
+  assert.equal(events(d)[0].status, 'done-advance')   // append-before-write: never silently under-counts
+})
+
+test('appendEvent is independently usable, order-preserving, and seq-stamped', () => {
+  const d = tmp()
+  appendEvent(d, { type: 'note', n: 1 })
+  appendEvent(d, { type: 'note', n: 2 })
+  assert.deepEqual(events(d).map((e) => [e.seq, e.n]), [[1, 1], [2, 2]])
+})
+
+// ---- Phase B: authority tiers (fail-closed, monotonic) ----
+test('authority is monotonic and fail-closed: no/unknown authority permits nothing; grants never silently escalate', () => {
+  assert.equal(authorityAllows({}, 'triage'), false)                    // no grant → nothing (fail-closed)
+  assert.equal(authorityAllows({ authority: 'bogus' }, 'triage'), false) // unknown → nothing
+  assert.equal(authorityAllows({ authority: 'triage' }, 'triage'), true)
+  assert.equal(authorityAllows({ authority: 'triage' }, 'push'), false)  // can't escalate above grant
+  assert.equal(authorityAllows({ authority: 'merge' }, 'push'), true)    // higher grant covers lower
+  // fail-closed on the ACTION side too: an unknown/typo'd/case-mismatched action is denied, never granted
+  assert.equal(authorityAllows({ authority: 'merge' }, 'bogus'), false)
+  assert.equal(authorityAllows({ authority: 'merge' }, 'MERGE'), false)  // case-sensitive
+  assert.equal(authorityAllows({ authority: 'merge' }, undefined), false)
+  assert.ok(authorityRank('merge') > authorityRank('triage'))
+})
+
+// ---- Phase B: completion condition (conjunction of falsifiable commands, fail-closed) ----
+test('evaluateGoal: met iff ALL predicate commands pass (exit 0); empty predicates → NOT met (a loop needs a "no")', () => {
+  assert.equal(evaluateGoal({ predicates: [{ cmd: 'true' }, { cmd: 'true' }] }).met, true)
+  const mixed = evaluateGoal({ predicates: [{ cmd: 'true' }, { cmd: 'false', label: 'gate' }] })
+  assert.equal(mixed.met, false)                       // conjunction: one failing predicate fails the whole
+  assert.equal(mixed.results[1].pass, false)
+  assert.equal(mixed.results[1].label, 'gate')
+  assert.equal(evaluateGoal({ predicates: [] }).met, false)  // fail-closed: nothing to say no
+  assert.equal(evaluateGoal({}).met, false)
+})
+
+// ---- Phase B: loop bounds (the other half of stop = done ∨ bound) ----
+test('boundExceeded returns the tripped bound name, else null', () => {
+  const b = { maxIterations: 3, capTokens: 1000, noProgressRounds: 2 }
+  assert.equal(boundExceeded(b, { iterations: 1, tokens: 10, noProgress: 0 }), null)
+  assert.equal(boundExceeded(b, { iterations: 3 }), 'maxIterations')
+  assert.equal(boundExceeded(b, { tokens: 1000 }), 'capTokens')
+  assert.equal(boundExceeded(b, { noProgress: 2 }), 'noProgressRounds')
+  assert.equal(boundExceeded(null, { iterations: 99 }), null)  // no bounds declared → no trip
+})

From efbdc1cdfe18776383700f2a61e9a6d5ef24a0eb Mon Sep 17 00:00:00 2001
From: Number531 <120485065+Number531@users.noreply.github.com>
Date: Tue, 16 Jun 2026 00:12:45 -0400
Subject: [PATCH 2/3] =?UTF-8?q?feat(deep-plan):=20Phase=20C=20=E2=80=94=20?=
 =?UTF-8?q?generic=20loop=20engine=20(run-loop.mjs)=20+=20synthetic=20harn?=
 =?UTF-8?q?ess?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

The autonomous-loop ENGINE that consumes the Phase-A/B governable-autonomy layer. Advances a
queue: per item, run the item's ACTION then evaluate its completion GOAL (the validator that
can say no), gating on AUTHORITY + BOUNDS, recording to the run-ledger so it's RESUMABLE,
escalating on the first failure / tripped bound. Deliberately UN-WIRED — actionFor/goalFor are
injected; the real per-domain migration config is deferred to Stage 1 (prove one by hand first).

Validated by a SYNTHETIC harness (real true/false commands, deterministic): happy path,
goal-not-met, action-failed, maxIterations total-cap, authority fail-closed, empty queue, and
RESUME-from-ledger (skips completed, never re-runs). The harness proves the engine MECHANICS,
not real-migration fit (honest scope, stated in code + test headers).

Adversarial audit: SOUND. 3 MINOR hardening items remediated — malformed queues (duplicate /
missing / non-string ids) are now REJECTED fail-closed as 'invalid-queue' (never silently
'completed'), and the escalated ledger record is uniform across all stop reasons.

75 *.test.mjs + upgrade-harness + revise dry-run-harness green; zero non-test callers (engine is
capability-ahead-of-its-real-wiring, by design); backward-compatible.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .claude/skills/deep-plan/VISION.md            |   6 +-
 .claude/skills/deep-plan/scripts/run-loop.mjs |  84 ++++++++++++
 .../skills/deep-plan/test/run-loop.test.mjs   | 121 ++++++++++++++++++
 3 files changed, 210 insertions(+), 1 deletion(-)
 create mode 100644 .claude/skills/deep-plan/scripts/run-loop.mjs
 create mode 100644 .claude/skills/deep-plan/test/run-loop.test.mjs

diff --git a/.claude/skills/deep-plan/VISION.md b/.claude/skills/deep-plan/VISION.md
index 36e48aeb1..dd5d71c07 100644
--- a/.claude/skills/deep-plan/VISION.md
+++ b/.claude/skills/deep-plan/VISION.md
@@ -35,6 +35,7 @@ Every component is a **field on the run-ledger or a sibling file** — never a n
 | **Completion condition** | `run-ledger.goal` + `evaluateGoal()` | quantified, falsifiable "done" (AND of command predicates) | ✅ Phase B |
 | Decision journal | `DECISIONS.md` | human rationale (why, not what) | ✅ Phase A |
 | Budget boundary | `boundExceeded()` over `goal.bounds.capTokens` | token cap; stops runaway | ✅ Phase B (bound helper + active bound). The top-level `budget` field is *reserved* for the loop's spend-tracking — not yet wired. |
+| **Loop engine** | `run-loop.mjs` + synthetic harness | advance / gate / escalate / bound / authority / **resume** over the ledger | ✅ Phase C — **UN-WIRED**: mechanics proven against mocks; real-domain config (the migration skill + parity tests) deferred to Stage 1 |
 
 ### Completion condition — quantified (Layer-6 core)
 `done = (P₁ ∧ … ∧ Pₙ)` where each predicate is a **falsifiable command (exit 0 = pass)**,
@@ -65,11 +66,14 @@ schema change, an untested component) and you forfeit the property.**
 
 - **Phase A** — `run-events.jsonl` + sidecar metadata + this `VISION.md` + `DECISIONS.md`. *(built ✅)*
 - **Phase B** — `authority` tier + `goal`/`evaluateGoal` completion condition + `goal.bounds`/`boundExceeded` loop bounds. *(built ✅)* — the top-level `budget` field stays reserved (see Later).
+- **Phase C** — generic loop ENGINE (`run-loop.mjs`): advance/gate/escalate/bound/authority/resume over the ledger, validated by a synthetic harness. *(built ✅ — UN-WIRED: the engine exists; the real-domain config is deferred, see Later.)*
 - **Later (consumer-gated — build when its reader exists):**
   - Tool-enforced control plane (strip `Edit` from the conductor) — Layer 3.
   - Decision-ready escalation brief format — Layer 7.
   - Budget boundary field — when a loop reads it.
-  - **The rollout loop** (`/loop` + `/goal` over the ledger) for Stage-3+ per-domain MCP→Python
+  - **Wire the loop to real domains** — point the Phase-C engine's `actionFor`/`goalFor` at the Stage-3+
+    per-domain MCP→Python migration skill + its real parity tests. The engine is built; the wiring + the
+    real validators come from Stage 1 (which proves one migration by hand first), ahead of the per-domain
     migrations — the repetitive substrate where automation pays off. Do NOT build before that
     substrate exists (the "build machinery before there is work for it" trap).
 
diff --git a/.claude/skills/deep-plan/scripts/run-loop.mjs b/.claude/skills/deep-plan/scripts/run-loop.mjs
new file mode 100644
index 000000000..13b54477e
--- /dev/null
+++ b/.claude/skills/deep-plan/scripts/run-loop.mjs
@@ -0,0 +1,84 @@
+// Generic, domain-agnostic LOOP ENGINE over the run-ledger (Phase C).
+//
+// Advances a queue: for each item it runs the item's ACTION (the work), then evaluates the item's
+// completion GOAL (the validator that can say "no"), respecting AUTHORITY and BOUNDS, recording every
+// step to the run-ledger so the loop is RESUMABLE, and ESCALATING on the first failure or tripped bound.
+//
+// UN-WIRED by design: the per-item action command and goal predicates are INJECTED via `actionFor(item)`
+// and `goalFor(item)`. The real config — pointing this at the Stage-3+ per-domain migration skill + its
+// real parity tests — is a config step deferred until Stage 1 proves one migration by hand. This module
+// is validated against a SYNTHETIC harness (run-loop.test.mjs), which proves the engine MECHANICS, not
+// the real-migration fit.
+//
+// SAFETY: action/goal commands run in a shell (same trust boundary as evaluateGoal) and MUST be
+// operator/plan-authored. The loop is fail-closed: it runs only when the ledger (or the explicit param)
+// grants 'implement' authority.
+import { execSync } from 'node:child_process'
+import { readLedger, writeMeta, appendEvent, authorityAllows, evaluateGoal, boundExceeded } from './run-ledger.mjs'
+
+const itemId = (it) => (typeof it === 'string' ? it : it && it.id)
+const msg = (e) => String((e && e.message) || e).slice(0, 160)
+
+// The resumable primitive: the first queue item not yet recorded complete in the ledger.
+export function firstPendingItem(queue, completed) {
+  const done = new Set(completed)
+  return (queue || []).find((it) => !done.has(itemId(it)))
+}
+
+// runLoop({ planDir, queue, actionFor, goalFor, authority?, bounds?, cwd? }) → { status, ... }
+//   status 'complete'  — every queue item passed its goal
+//   status 'escalated' — stopped on authority-denied / a tripped bound / action-failed / goal-not-met
+// Resumable: re-invoking after an escalation skips already-completed items (read from ledger.loop).
+export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bounds = {}, cwd } = {}) {
+  const ledger = readLedger(planDir) || {}
+  const granted = authority ?? ledger.authority
+  // load prior progress BEFORE any gate: finish() writes the `completed` it is handed into ledger.loop, so an early stop must pass the real list, not []
+  const completed = [...((ledger.loop || {}).completed || [])]
+  if (!authorityAllows({ authority: granted }, 'implement')) {
+    return finish(planDir, 'escalated', { reason: 'authority-denied', authority: granted ?? null, completed })
+  }
+  // queue precondition (fail-closed): every item needs a stable, UNIQUE, non-empty string id (the resume key); a malformed queue is REJECTED, never silently "completed"
+  const ids = (queue || []).map(itemId)
+  if (ids.some((id) => typeof id !== 'string' || id === '')) {
+    return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'every queue item needs a non-empty string id', completed })
+  }
+  if (new Set(ids).size !== ids.length) {
+    return finish(planDir, 'escalated', { reason: 'invalid-queue', detail: 'duplicate item ids', completed })
+  }
+  for (;;) {
+    const item = firstPendingItem(queue, completed)
+    if (item === undefined) return finish(planDir, 'complete', { completed })
+
+    const bound = boundExceeded(bounds, { iterations: completed.length })
+    if (bound) return finish(planDir, 'escalated', { reason: bound, item: itemId(item), completed })
+
+    // 1. run the item's ACTION (the work that mutates state)
+    let actionOk = true, actionErr = null
+    const cmd = actionFor && actionFor(item)
+    if (cmd) {
+      try { execSync(cmd, { cwd, stdio: 'pipe', timeout: 120000, maxBuffer: 10 * 1024 * 1024 }) }
+      catch (e) { actionOk = false; actionErr = msg(e) }
+    }
+    // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed
+    const ev = actionOk ? evaluateGoal(goalFor && goalFor(item), { cwd }) : { met: false, results: [] }
+    appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, met: ev.met })
+
+    if (actionOk && ev.met) {
+      completed.push(itemId(item))
+      writeMeta(planDir, { loop: { completed, escalated: null } })
+      continue
+    }
+    // first failure → escalate (stop), preserving what completed (resume picks up here)
+    const reason = actionOk ? 'goal-not-met' : 'action-failed'
+    const failing = ev.results.filter((r) => !r.pass).map((r) => r.label)
+    writeMeta(planDir, { loop: { completed, escalated: { item: itemId(item), reason, failing } } })
+    return finish(planDir, 'escalated', { reason, item: itemId(item), failing, completed, actionErr }, false)
+  }
+}
+
+function finish(planDir, status, data, writeState = true) {
+  // uniform `escalated` shape across ALL stop reasons (authority / invalid-queue / bound / failure)
+  if (writeState) writeMeta(planDir, { loop: { completed: data.completed || [], escalated: status === 'complete' ? null : { reason: data.reason, item: data.item ?? null } } })
+  appendEvent(planDir, { type: 'loop-end', status, reason: data.reason ?? null, completed: (data.completed || []).length })
+  return { status, ...data }
+}
diff --git a/.claude/skills/deep-plan/test/run-loop.test.mjs b/.claude/skills/deep-plan/test/run-loop.test.mjs
new file mode 100644
index 000000000..895b924e8
--- /dev/null
+++ b/.claude/skills/deep-plan/test/run-loop.test.mjs
@@ -0,0 +1,121 @@
+// Synthetic harness for the generic loop ENGINE (Phase C). Validates the MECHANICS — advance / gate /
+// escalate / bound / authority / RESUME — against a fake queue with controllable pass/fail commands
+// (`true`/`false`), NOT real domain migration. The test commands are the engine's "something that can
+// say no." (Real-migration fit is deferred to Stage 1; this proves the engine does what it's coded to.)
+import { test } from 'node:test'
+import assert from 'node:assert/strict'
+import fs from 'node:fs'
+import os from 'node:os'
+import path from 'node:path'
+import { runLoop, firstPendingItem } from '../scripts/run-loop.mjs'
+import { readLedger, EVENTS_FILE } from '../scripts/run-ledger.mjs'
+
+const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-loop-'))
+const actionFor = (it) => it.action
+const goalFor = (it) => ({ predicates: [{ cmd: it.goal, label: it.id }] })
+const item = (id, action, goal) => ({ id, action, goal })
+const loopEvents = (d) => fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').map(JSON.parse).filter((e) => e.type === 'loop' || e.type === 'loop-end')
+
+// ---- authority gate (fail-closed) ----
+test('the loop refuses to run without implement authority (fail-closed)', () => {
+  const d = tmp()
+  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'triage' })
+  assert.equal(r.status, 'escalated')
+  assert.equal(r.reason, 'authority-denied')
+  assert.deepEqual(r.completed, [])
+})
+test('implement (or higher) authority lets the loop run', () => {
+  const d = tmp()
+  assert.equal(runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'merge' }).status, 'complete')
+})
+
+// ---- happy path ----
+test('all items pass action + goal → complete, in order', () => {
+  const d = tmp()
+  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r.status, 'complete')
+  assert.deepEqual(r.completed, ['a', 'b'])
+  assert.deepEqual(readLedger(d).loop.completed, ['a', 'b'])
+})
+
+// ---- the two failure modes ----
+test('a failing GOAL escalates at that item (completed preserved), with the failing predicate named', () => {
+  const d = tmp()
+  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r.status, 'escalated')
+  assert.equal(r.reason, 'goal-not-met')
+  assert.equal(r.item, 'b')
+  assert.deepEqual(r.completed, ['a'])
+  assert.deepEqual(r.failing, ['b'])
+})
+test('a failing ACTION escalates before the goal is even evaluated', () => {
+  const d = tmp()
+  const r = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'false', 'true')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r.status, 'escalated')
+  assert.equal(r.reason, 'action-failed')
+  assert.equal(r.item, 'b')
+  assert.deepEqual(r.completed, ['a'])
+})
+
+// ---- bounds (total cap, resume-safe) ----
+test('maxIterations caps total completed items, then escalates (not silently truncates)', () => {
+  const d = tmp()
+  const q = ['a', 'b', 'c', 'd', 'e'].map((id) => item(id, 'true', 'true'))
+  const r = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } })
+  assert.equal(r.status, 'escalated')
+  assert.equal(r.reason, 'maxIterations')
+  assert.deepEqual(r.completed, ['a', 'b'])
+  assert.equal(readLedger(d).loop.escalated.item, 'c')   // uniform escalated shape: bound stop records the item too
+})
+
+// ---- RESUME (the point of the ledger) ----
+test('after an escalation, re-running resumes from the failed item (skips completed) and finishes', () => {
+  const d = tmp()
+  // run 1: b's goal fails → escalate at b, completed [a]
+  const r1 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r1.status, 'escalated')
+  assert.deepEqual(r1.completed, ['a'])
+  // "fix" b (its goal now passes) and re-run on the SAME ledger → resumes at b, completes
+  const r2 = runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r2.status, 'complete')
+  assert.deepEqual(r2.completed, ['a', 'b'])
+  // 'a' was NOT re-run on resume (firstPendingItem skipped it)
+  assert.equal(readLedger(d).loop.completed.filter((x) => x === 'a').length, 1)
+})
+
+// ---- empty queue ----
+test('an empty queue completes vacuously (nothing to do)', () => {
+  const d = tmp()
+  const r = runLoop({ planDir: d, queue: [], actionFor, goalFor, authority: 'implement' })
+  assert.equal(r.status, 'complete')
+  assert.deepEqual(r.completed, [])
+})
+
+test('a malformed queue is REJECTED (fail-closed), never silently completed', () => {
+  // duplicate ids would collapse two distinct items into one → reject
+  const dup = runLoop({ planDir: tmp(), queue: [item('a', 'true', 'true'), item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' })
+  assert.equal(dup.status, 'escalated')
+  assert.equal(dup.reason, 'invalid-queue')
+  // an item with no stable string id has no resumable identity → reject
+  const noid = runLoop({ planDir: tmp(), queue: [{ action: 'true', goal: 'true' }], actionFor, goalFor, authority: 'implement' })
+  assert.equal(noid.status, 'escalated')
+  assert.equal(noid.reason, 'invalid-queue')
+})
+
+// ---- ledger + event trail ----
+test('the loop records per-item events + a loop-end event to the run-ledger trail', () => {
+  const d = tmp()
+  runLoop({ planDir: d, queue: [item('a', 'true', 'true'), item('b', 'true', 'false')], actionFor, goalFor, authority: 'implement' })
+  const ev = loopEvents(d)
+  assert.equal(ev.filter((e) => e.type === 'loop').length, 2)         // one per attempted item
+  assert.equal(ev.filter((e) => e.type === 'loop-end').length, 1)
+  assert.equal(ev.find((e) => e.type === 'loop-end').status, 'escalated')
+})
+
+// ---- the resumable primitive in isolation ----
+test('firstPendingItem returns the first not-yet-completed item, or undefined when all done', () => {
+  const q = [item('a', 'true', 'true'), item('b', 'true', 'true')]
+  assert.equal(firstPendingItem(q, []).id, 'a')
+  assert.equal(firstPendingItem(q, ['a']).id, 'b')
+  assert.equal(firstPendingItem(q, ['a', 'b']), undefined)
+})

From 6f17480ea20adf602827f38cb3e2e53f6d734360 Mon Sep 17 00:00:00 2001
From: Number531 <120485065+Number531@users.noreply.github.com>
Date: Tue, 16 Jun 2026 00:59:10 -0400
Subject: [PATCH 3/3] fix(deep-plan): remediate full-audit findings on the
 governable-autonomy layer
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Three independent adversarial audits of PR #229 (A+B+C): code SOUND, backward-compat CLEAN, docs
HAD-CONTRADICTIONS (schema doc lagged Phase C). All real findings remediated; code was not
overclaimed (every '✅ built' verified real). No fail-open, no regression.

Code (fail-closed / robustness / observability):
- run-loop: distinguish 'goal-absent' (missing goal config) from 'goal-not-met' (goal ran + failed);
  record actionSkipped so a typo'd real wiring leaves a trail signal, not a silent skip.
- evaluateGoal: guard a predicate with no cmd → clean 'predicate has no cmd' error, not a raw TypeError.
- appendEvent: torn-write guard — separate a crashed partial line so seq/parse stay sane.

Docs (schema doc was written before Phase C landed):
- document the `loop` ledger field + the `loop`/`loop-end` event types that run-loop.mjs writes.
- correct 'before the loop exists' → the engine is built-but-un-wired; budget is read by nothing yet.
- soften the events-log 'Reader: conductor' claim → observability, not yet consumed (write-only today).
- DECISIONS.md: Phase-C entry recording the build-engine-but-defer-wiring decision.

Tests (+5): missing-cmd guard; goal-absent vs goal-not-met; actionSkipped; the Phase-A×Phase-C
cross-path (loop preserves legs); resume after a maxIterations escalation.

Deferred (LOW, noted): per-item writeMeta is O(N) full-ledger rewrites + a redundant meta event —
fine for the bounded domain queue, optimize only if queues grow large.

80 *.test.mjs + upgrade-harness + revise dry-run-harness green; still zero non-test callers; backward-compatible.

Co-Authored-By: Claude Fable 5 <noreply@anthropic.com>
---
 .../references/run-ledger.schema.md           | 33 +++++++++----
 .claude/skills/deep-plan/DECISIONS.md         | 11 +++++
 .../skills/deep-plan/scripts/run-ledger.mjs   | 16 +++++--
 .claude/skills/deep-plan/scripts/run-loop.mjs | 17 ++++---
 .../test/run-ledger-governance.test.mjs       |  5 ++
 .../skills/deep-plan/test/run-loop.test.mjs   | 48 ++++++++++++++++++-
 6 files changed, 110 insertions(+), 20 deletions(-)

diff --git a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
index 1c80aff57..2c7dde20b 100644
--- a/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
+++ b/.claude/skills/deep-plan-pipeline/references/run-ledger.schema.md
@@ -38,14 +38,21 @@ created before this contract run unchanged (the conductor falls back to its prio
     "predicates": [ { "label": "parity", "kind": "command", "cmd": "node …/X-golden-parity.test.js" } ],
     "met": false,            // = AND(predicate exit codes); empty predicates ⇒ false (fail-closed)
     "bounds": { "maxIterations": 3, "capTokens": 800000, "noProgressRounds": 2 }
+  },
+
+  // — Phase C: loop-engine resume state (written by run-loop.mjs; absent until the loop runs) —
+  "loop": {
+    "completed": ["item-id"],                 // queue items whose action + goal passed (the resume key set)
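+    "escalated": { "item": "…", "reason": "goal-not-met|goal-absent|action-failed|invalid-queue|authority-denied|maxIterations|capTokens|noProgressRounds", "failing": ["<predicate label>"] }  // null when complete; "item" is null for authority-denied/invalid-queue; "failing" is written only on goal-*/action-failed stops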
   }
 }
 ```
 
-## Governable-autonomy fields (Phase B)
+## Governable-autonomy fields (Phases B–C)
 
-These make the ledger ready for an autonomous loop *before* the loop exists — each is optional and
-inert until a consumer reads it (the conductor / a future rollout loop).
+These make the ledger ready for the autonomous loop engine (`run-loop.mjs`, Phase C — **built but
+un-wired** to real domains). Each is optional and inert until a consumer reads it (the loop engine, once
+its real-domain config is wired; the conductor for resume).
 
 - **`authority`** — the granted permission tier (`triage < implement < push < merge`), checked by
   `authorityAllows(ledger, action)`. **Fail-closed**: absent/unknown authority permits nothing, so a
@@ -55,8 +62,12 @@ inert until a consumer reads it (the conductor / a future rollout loop).
   runaway (the 6M-token failure had none). Bound fields are positive integers; `0`/negative trip
   immediately and should be treated as misconfiguration.
 - **`budget`** (reserved) — a top-level `{capTokens, spent}` field for the loop's own spend-tracking;
-  NOT yet wired to a helper. The **active** token cap today is `goal.bounds.capTokens`; when the loop
-  ships it will reconcile `budget.spent` against `capTokens` and own this field.
+  NOT yet read by any helper or by the Phase-C loop engine (which today tracks completion in
+  `loop.completed`, not token spend). The **active** token cap is `goal.bounds.capTokens`; the budget
+  field is wired when the loop's real-domain config tracks spend.
+- **`loop`** — the loop engine's resume state (`{completed, escalated}`), written by `run-loop.mjs` and
+  read back by `firstPendingItem` to resume mid-queue. Disjoint from `legs` (a real pipeline's per-leg
+  state) — the two coexist in one ledger without collision.
 - **`goal`** — the quantified "done", evaluated by `evaluateGoal(goal, {cwd})`:
   `met = AND(predicate.cmd exit 0)`. Every predicate is a **falsifiable command** (deterministic-first;
   a metric/verdict predicate is just a command that exits 0/1). **Empty predicates ⇒ not met**
@@ -71,13 +82,17 @@ The Claude-side machine transition trail — the lighter counterpart to the Code
 rewritten; each carries a **monotonic 1-based `seq`** for ordering + replay/gap detection.
 
 ```jsonc
-{ "seq": 1, "type": "leg",  "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" }
-{ "seq": 2, "type": "meta", "keys": ["loopbackCount"], "at": null }
+{ "seq": 1, "type": "leg",      "leg": "deepPlan", "status": "done-advance", "verdict": "sound", "at": "YYYY-MM-DD" }
+{ "seq": 2, "type": "meta",     "keys": ["loopbackCount"], "at": null }
+{ "seq": 3, "type": "loop",     "item": "…", "actionOk": true, "actionSkipped": false, "met": true }   // Phase C: one per loop item
+{ "seq": 4, "type": "loop-end", "status": "complete|escalated", "reason": null, "completed": 2 }        // Phase C: loop terminal
 ```
 
 Write-ahead ordering (event appended *before* the ledger write) means a crash over-counts (an event
-with no committed state — auditable) rather than silently under-counting. Reader: the conductor's
-resume/observability path. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude
+with no committed state — auditable) rather than silently under-counting; a torn-write guard in
+`appendEvent` separates a crashed partial line so `seq` stays sane. Reader (intended): **observability**
+— the events log is NOT yet consumed by any code; the conductor resumes from `run-ledger.json` via
+`firstPendingLeg`, not from this file. It is *unit-tested*, not JSON-schema-validated (the deliberate lean-Claude
 posture; the Codex `deep-plan-events.jsonl` carries the heavier registered-schema discipline).
 
 ## Status enum (the single point that preserves verdict semantics)
diff --git a/.claude/skills/deep-plan/DECISIONS.md b/.claude/skills/deep-plan/DECISIONS.md
index cbe23af5f..b43c91ef7 100644
--- a/.claude/skills/deep-plan/DECISIONS.md
+++ b/.claude/skills/deep-plan/DECISIONS.md
@@ -34,3 +34,14 @@ conductor's routing semantics so a resumed run can never skip a certification ga
 Karpathy's rule for autonomy is "set the boundaries first" (objective, metric, permissions) and keep
 a record you can re-enter through. Budget boundary + the rollout loop are deferred until a consumer
 (the loop) exists — don't build machinery ahead of its substrate.
+
+## 2026-06-16 — Built the loop ENGINE (Phase C) but kept it UN-WIRED
+**Decision:** build a generic, domain-agnostic loop engine (`run-loop.mjs`) now — validated against a
+synthetic harness — but inject `actionFor`/`goalFor` and leave it un-wired to real domains. **Why:** the
+"don't build machinery before its substrate" rule applies at the *wiring* boundary, not the *engine*
+boundary. The engine's mechanics (advance / gate / escalate / bound / resume) are knowable from first
+principles + the orchestrator-worker research and are fully testable against mocks — so a synthetic
+harness *can* say "no" to the engine. What's genuinely unknowable without Stage 1 is the *fit* to real
+migration (the real queue, the real parity tests, the escalation UX), so the **wiring** stays deferred.
+A synthetic test proves "the engine does what it's coded to," not "this design fits real migration" — the
+latter still wants Stage 1 to prove one migration by hand first.
diff --git a/.claude/skills/deep-plan/scripts/run-ledger.mjs b/.claude/skills/deep-plan/scripts/run-ledger.mjs
index c8f001c9e..62326814c 100644
--- a/.claude/skills/deep-plan/scripts/run-ledger.mjs
+++ b/.claude/skills/deep-plan/scripts/run-ledger.mjs
@@ -97,7 +97,11 @@ function eventCount(planDir) {
 // writeLeg/writeMeta log WRITE-AHEAD (event before the ledger write) so a crash over-counts
 // (auditable) rather than silently under-counts. Reader: the conductor's resume/observability path.
 export function appendEvent(planDir, event) {
-  fs.appendFileSync(eventsPath(planDir), JSON.stringify({ seq: eventCount(planDir) + 1, ...event }) + '\n')
+  const p = eventsPath(planDir)
+  // torn-write guard: if a prior crash left a final line without a trailing newline, separate it so the
+  // new record never concatenates onto a partial one (keeps the append-only log parseable + seq sane).
+  try { const b = fs.readFileSync(p); if (b.length && b[b.length - 1] !== 0x0a) fs.appendFileSync(p, '\n') } catch { /* absent — nothing to repair */ }
+  fs.appendFileSync(p, JSON.stringify({ seq: eventCount(planDir) + 1, ...event }) + '\n')
 }
 
 // read-merge-write a single leg entry; never clobbers another leg's key.
@@ -193,9 +197,13 @@ export function authorityAllows(ledger, action) {
 export function evaluateGoal(goal, { cwd, timeoutMs = 120000, maxBuffer = 10 * 1024 * 1024 } = {}) {
   const results = (goal && Array.isArray(goal.predicates) ? goal.predicates : []).map((p) => {
     let pass = false, error = null
-    try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true }
-    catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' }
-    return { label: p.label ?? (p.cmd || '').slice(0, 60), kind: p.kind ?? 'command', pass, error }
+    if (!p || typeof p.cmd !== 'string' || !p.cmd) {
+      error = 'predicate has no cmd' // guard: a clean message, not a raw Node TypeError
+    } else {
+      try { execSync(p.cmd, { cwd, stdio: 'pipe', timeout: timeoutMs, maxBuffer }); pass = true }
+      catch (e) { error = e && e.message ? String(e.message).slice(0, 160) : 'nonzero exit' }
+    }
+    return { label: (p && p.label) ?? (p && typeof p.cmd === 'string' && p.cmd ? p.cmd.slice(0, 60) : '<no-cmd>'), kind: (p && p.kind) ?? 'command', pass, error } // '<no-cmd>' when there is neither a label nor a usable string cmd
   })
   const met = results.length > 0 && results.every((r) => r.pass) // empty → not met
   return { met, results }
diff --git a/.claude/skills/deep-plan/scripts/run-loop.mjs b/.claude/skills/deep-plan/scripts/run-loop.mjs
index 13b54477e..ad16ee145 100644
--- a/.claude/skills/deep-plan/scripts/run-loop.mjs
+++ b/.claude/skills/deep-plan/scripts/run-loop.mjs
@@ -52,16 +52,21 @@ export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bo
     const bound = boundExceeded(bounds, { iterations: completed.length })
     if (bound) return finish(planDir, 'escalated', { reason: bound, item: itemId(item), completed })
 
-    // 1. run the item's ACTION (the work that mutates state)
+    // 1. run the item's ACTION (the work that mutates state). A falsy actionFor is a goal-only item,
+    //    recorded as actionSkipped so a typo'd real wiring leaves a signal in the trail (not silent).
     let actionOk = true, actionErr = null
     const cmd = actionFor && actionFor(item)
+    const actionSkipped = !cmd
     if (cmd) {
       try { execSync(cmd, { cwd, stdio: 'pipe', timeout: 120000, maxBuffer: 10 * 1024 * 1024 }) }
       catch (e) { actionOk = false; actionErr = msg(e) }
     }
-    // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed
-    const ev = actionOk ? evaluateGoal(goalFor && goalFor(item), { cwd }) : { met: false, results: [] }
-    appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, met: ev.met })
+    // 2. evaluate the item's GOAL (the falsifiable validator) — skipped if the action already failed.
+    //    A missing/empty goal is 'goal-absent' (fail-closed config error), distinct from 'goal-not-met'.
+    const goal = goalFor && goalFor(item)
+    const goalAbsent = !goal || !Array.isArray(goal.predicates) || goal.predicates.length === 0
+    const ev = actionOk ? evaluateGoal(goal, { cwd }) : { met: false, results: [] }
+    appendEvent(planDir, { type: 'loop', item: itemId(item), actionOk, actionSkipped, met: ev.met })
 
     if (actionOk && ev.met) {
       completed.push(itemId(item))
@@ -69,8 +74,8 @@ export function runLoop({ planDir, queue = [], actionFor, goalFor, authority, bo
       continue
     }
     // first failure → escalate (stop), preserving what completed (resume picks up here)
-    const reason = actionOk ? 'goal-not-met' : 'action-failed'
-    const failing = ev.results.filter((r) => !r.pass).map((r) => r.label)
+    const reason = !actionOk ? 'action-failed' : (goalAbsent ? 'goal-absent' : 'goal-not-met')
+    const failing = goalAbsent ? ['<no-predicates>'] : ev.results.filter((r) => !r.pass).map((r) => r.label)
     writeMeta(planDir, { loop: { completed, escalated: { item: itemId(item), reason, failing } } })
     return finish(planDir, 'escalated', { reason, item: itemId(item), failing, completed, actionErr }, false)
   }
diff --git a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs
index 4313ffaa4..144134dae 100644
--- a/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs
+++ b/.claude/skills/deep-plan/test/run-ledger-governance.test.mjs
@@ -79,6 +79,11 @@ test('evaluateGoal: met iff ALL predicate commands pass (exit 0); empty predicat
   assert.equal(evaluateGoal({ predicates: [] }).met, false)  // fail-closed: nothing to say no
   assert.equal(evaluateGoal({}).met, false)
 })
+test('evaluateGoal: a predicate with no cmd is fail-closed with a clean message (not a raw TypeError)', () => {
+  const r = evaluateGoal({ predicates: [{ label: 'x' }] })
+  assert.equal(r.met, false)
+  assert.equal(r.results[0].error, 'predicate has no cmd')
+})
 
 // ---- Phase B: loop bounds (the other half of stop = done ∨ bound) ----
 test('boundExceeded returns the tripped bound name, else null', () => {
diff --git a/.claude/skills/deep-plan/test/run-loop.test.mjs b/.claude/skills/deep-plan/test/run-loop.test.mjs
index 895b924e8..f99ab0b8d 100644
--- a/.claude/skills/deep-plan/test/run-loop.test.mjs
+++ b/.claude/skills/deep-plan/test/run-loop.test.mjs
@@ -8,7 +8,7 @@ import fs from 'node:fs'
 import os from 'node:os'
 import path from 'node:path'
 import { runLoop, firstPendingItem } from '../scripts/run-loop.mjs'
-import { readLedger, EVENTS_FILE } from '../scripts/run-ledger.mjs'
+import { readLedger, writeLeg, EVENTS_FILE } from '../scripts/run-ledger.mjs'
 
 const tmp = () => fs.mkdtempSync(path.join(os.tmpdir(), 'run-loop-'))
 const actionFor = (it) => it.action
@@ -119,3 +119,49 @@ test('firstPendingItem returns the first not-yet-completed item, or undefined wh
   assert.equal(firstPendingItem(q, ['a']).id, 'b')
   assert.equal(firstPendingItem(q, ['a', 'b']), undefined)
 })
+
+// ---- a missing goal is a CONFIG error (goal-absent), distinct from a failing goal ----
+test('a missing/empty goal is goal-ABSENT (fail-closed config error), not goal-not-met', () => {
+  const r = runLoop({
+    planDir: tmp(), // fresh temp directory created by the tmp() helper
+    queue: [item('a', 'true', 'true'), { id: 'b', action: 'true' }], // 'b' is a literal with an action but no goal field
+    actionFor,
+    goalFor: (it) => (it.goal ? { predicates: [{ cmd: it.goal, label: it.id }] } : {}), // b has no goal, so it gets {} (no predicates)
+    authority: 'implement',
+  })
+  assert.equal(r.status, 'escalated') // the run must not report completion
+  assert.equal(r.reason, 'goal-absent') // reported separately from 'goal-not-met'
+  assert.equal(r.item, 'b') // the escalation names the item that lacks a goal
+  assert.deepEqual(r.failing, ['<no-predicates>']) // placeholder entry used when the goal is absent
+})
+
+// ---- a goal-only item records actionSkipped (a typo'd real wiring leaves a signal) ----
+test('a goal-only item (no action) is recorded actionSkipped and completes on its goal', () => {
+  const d = tmp() // keep a handle on the plan dir so the event log can be read back below
+  const r = runLoop({ planDir: d, queue: [{ id: 'a', goal: 'true' }], actionFor: () => undefined, goalFor, authority: 'implement' }) // actionFor yields undefined for every item
+  assert.equal(r.status, 'complete') // the item still completes without an action
+  const ev = fs.readFileSync(path.join(d, EVENTS_FILE), 'utf8').trim().split('\n').map(JSON.parse).find((e) => e.type === 'loop') // parse the newline-delimited JSON log; take the first 'loop' event
+  assert.equal(ev.actionSkipped, true) // NOTE(review): if no 'loop' event exists, ev is undefined and this throws a TypeError instead of an assertion failure
+})
+
+// ---- Phase-A × Phase-C: the loop's `loop` field is disjoint from the legs a real pipeline writes ----
+test('the loop preserves existing leg state (loop and legs are disjoint ledger keys)', () => {
+  const d = tmp() // one plan dir shared by the pre-seeded leg and the loop run
+  writeLeg(d, 'deepPlan', { status: 'done-advance', verdict: 'sound' }) // a real pipeline left legs here
+  runLoop({ planDir: d, queue: [item('a', 'true', 'true')], actionFor, goalFor, authority: 'implement' }) // return value unused; only the ledger read back below is checked
+  const led = readLedger(d) // read the ledger back after the run
+  assert.equal(led.legs.deepPlan.status, 'done-advance') // legs untouched by the loop
+  assert.deepEqual(led.loop.completed, ['a'])            // loop state sits alongside, not clobbering
+})
+
+// ---- resume after a BOUND escalation (not just a failure) ----
+test('resume after a maxIterations escalation: raising the cap continues from where it stopped', () => {
+  const d = tmp() // the same plan dir is passed to both runs
+  const q = ['a', 'b', 'c'].map((id) => item(id, 'true', 'true')) // three queue items with ids a, b, c
+  const r1 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 2 } }) // first run: cap of 2 iterations
+  assert.equal(r1.reason, 'maxIterations') // the tripped bound's name is reported as the reason
+  assert.deepEqual(r1.completed, ['a', 'b']) // two items finished before the cap stopped the run
+  const r2 = runLoop({ planDir: d, queue: q, actionFor, goalFor, authority: 'implement', bounds: { maxIterations: 5 } }) // second run: same dir and queue, cap raised to 5
+  assert.equal(r2.status, 'complete') // with the higher cap the queue finishes
+  assert.deepEqual(r2.completed, ['a', 'b', 'c']) // resumed at c; a/b not re-run
+})