From e8bff7656fcbffdf40cb99396180866c3ab17440 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:28:30 -0700 Subject: [PATCH 1/5] fix: Codex description limit + wrong-repo bug Move skill routing table from root SKILL.md.tmpl description (1017/1024 chars) to body. Add 900-char warning threshold test to prevent future creep. Add -C flag to all 14 codex exec calls so Codex always runs in the correct git root. Fix pre-existing package.json version mismatch. Co-Authored-By: Claude Opus 4.6 (1M context) --- SKILL.md.tmpl | 30 ++++++++++++++++++++++-------- autoplan/SKILL.md.tmpl | 6 +++--- codex/SKILL.md.tmpl | 6 +++--- package.json | 2 +- scripts/resolvers/design.ts | 6 +++--- scripts/resolvers/review.ts | 6 +++--- test/gen-skill-docs.test.ts | 18 ++++++++++++++++++ 7 files changed, 53 insertions(+), 21 deletions(-) diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl index c0a39510e..31bd28375 100644 --- a/SKILL.md.tmpl +++ b/SKILL.md.tmpl @@ -7,14 +7,6 @@ description: | elements, verify state, diff before/after, take annotated screenshots, test responsive layouts, forms, uploads, dialogs, and capture bug evidence. Use when asked to open or test a site, verify a deployment, dogfood a user flow, or file a bug with screenshots. - Also suggest adjacent gstack skills by stage: brainstorm /office-hours; strategy - /plan-ceo-review; architecture /plan-eng-review; design /plan-design-review or - /design-consultation; auto-review /autoplan; debugging /investigate; QA /qa; code review - /review; visual audit /design-review; shipping /ship; docs /document-release; retro - /retro; second opinion /codex; prod safety /careful or /guard; scoped edits /freeze or - /unfreeze; gstack upgrades /gstack-upgrade. If the user opts out of suggestions, stop - and run gstack-config set proactive false; if they opt back in, run gstack-config set - proactive true. allowed-tools: - Bash - Read @@ -28,6 +20,28 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. +If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the +user's workflow stage: +- Brainstorming → /office-hours +- Strategy → /plan-ceo-review +- Architecture → /plan-eng-review +- Design → /plan-design-review or /design-consultation +- Auto-review → /autoplan +- Debugging → /investigate +- QA → /qa +- Code review → /review +- Visual audit → /design-review +- Shipping → /ship +- Docs → /document-release +- Retro → /retro +- Second opinion → /codex +- Prod safety → /careful or /guard +- Scoped edits → /freeze or /unfreeze +- Upgrades → /gstack-upgrade + +If the user opts out of suggestions, run `gstack-config set proactive false`. +If they opt back in, run `gstack-config set proactive true`. + # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl index b3e0a3402..661e8fb01 100644 --- a/autoplan/SKILL.md.tmpl +++ b/autoplan/SKILL.md.tmpl @@ -204,7 +204,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -315,7 +315,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." -s read-only --enable web_search_cached` + Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -380,7 +380,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. CEO: Design: - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index c0b7adb1b..338df93b6 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -159,7 +159,7 @@ With focus (e.g., "security"): 2. Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -244,7 +244,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -277,7 +277,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/package.json b/package.json index 0130b3514..cbf826161 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.11.16.0", + "version": "0.11.16.1", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", diff --git a/scripts/resolvers/design.ts b/scripts/resolvers/design.ts index 30b1fe2c8..c4926112a 100644 --- a/scripts/resolvers/design.ts +++ b/scripts/resolvers/design.ts @@ -17,7 +17,7 @@ If Codex is available, run a lightweight design check on the diff: \`\`\`bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -467,7 +467,7 @@ If user chooses A, launch both voices simultaneously: 1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): \`\`\`bash TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) -codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` @@ -636,7 +636,7 @@ which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" 1. **Codex design voice** (via Bash): \`\`\`bash TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) -codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +codex exec "${escapedCodexPrompt}" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: \`\`\`bash diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 2f355ef68..0087cda7c 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -286,7 +286,7 @@ Write the full prompt (context block + instructions) to this file. Use the mode- \`\`\`bash TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) -codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -370,7 +370,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal \`\`\`bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" \`\`\` Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr: @@ -525,7 +525,7 @@ THE PLAN: \`\`\`bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index d0da767a8..590f49504 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -152,6 +152,24 @@ describe('gen-skill-docs', () => { } }); + test('every Codex SKILL.md description stays under 900-char warning threshold', () => { + const WARN_THRESHOLD = 900; + const agentsDir = path.join(ROOT, '.agents', 'skills'); + if (!fs.existsSync(agentsDir)) return; + const violations: string[] = []; + for (const entry of fs.readdirSync(agentsDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + const skillMd = path.join(agentsDir, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillMd)) continue; + const content = fs.readFileSync(skillMd, 'utf-8'); + const description = extractDescription(content); + if (description.length > WARN_THRESHOLD) { + violations.push(`${entry.name}: ${description.length} chars (limit ${MAX_SKILL_DESCRIPTION_LENGTH}, ${MAX_SKILL_DESCRIPTION_LENGTH - description.length} remaining)`); + } + } + expect(violations).toEqual([]); + }); + test('package.json version matches VERSION file', () => { const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8')); const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim(); From fa5d0803ee20cd5f07629e4297445b009c3f1d4a Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:32:28 -0700 Subject: [PATCH 2/5] fix: Codex description limit + wrong-repo bug Move skill routing table from root SKILL.md.tmpl description (1017/1024 chars) to body where there's no length limit. Add 900-char warning threshold test. Add -C flag to all codex exec calls so Codex always runs in the correct git root directory. Co-Authored-By: Claude Opus 4.6 (1M context) --- SKILL.md.tmpl | 22 - autoplan/SKILL.md.tmpl | 6 +- benchmark/SKILL.md | 106 +- bin/gstack-telemetry-log | 44 +- browse/SKILL.md | 106 +- canary/SKILL.md | 93 +- careful/SKILL.md | 1 - codex/SKILL.md.tmpl | 6 +- cso/SKILL.md | 93 +- document-release/SKILL.md | 93 +- freeze/SKILL.md | 1 - gstack-upgrade/SKILL.md | 1 - guard/SKILL.md | 1 - investigate/SKILL.md | 93 +- land-and-deploy/SKILL.md | 91 +- qa-only/SKILL.md | 91 +- qa/SKILL.md | 91 +- retro/SKILL.md | 115 +- retro/SKILL.md.tmpl | 22 + review/SKILL.md.tmpl | 5 +- scripts/gen-skill-docs.ts | 2943 +++++++++++++++++++++++++++++++- scripts/resolvers/design.ts | 6 +- scripts/resolvers/index.ts | 5 +- scripts/resolvers/preamble.ts | 33 +- scripts/resolvers/review.ts | 239 ++- scripts/resolvers/testing.ts | 52 +- setup-browser-cookies/SKILL.md | 106 +- setup-deploy/SKILL.md | 93 +- ship/SKILL.md.tmpl | 51 +- test/gen-skill-docs.test.ts | 180 +- test/helpers/e2e-helpers.ts | 2 +- test/helpers/touchfiles.ts | 7 +- unfreeze/SKILL.md | 1 - 33 files changed, 4517 insertions(+), 282 deletions(-) diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl index 31bd28375..fca8fa605 100644 --- a/SKILL.md.tmpl +++ b/SKILL.md.tmpl @@ -20,28 +20,6 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. -If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the -user's workflow stage: -- Brainstorming → /office-hours -- Strategy → /plan-ceo-review -- Architecture → /plan-eng-review -- Design → /plan-design-review or /design-consultation -- Auto-review → /autoplan -- Debugging → /investigate -- QA → /qa -- Code review → /review -- Visual audit → /design-review -- Shipping → /ship -- Docs → /document-release -- Retro → /retro -- Second opinion → /codex -- Prod safety → /careful or /guard -- Scoped edits → /freeze or /unfreeze -- Upgrades → /gstack-upgrade - -If the user opts out of suggestions, run `gstack-config set proactive false`. -If they opt back in, run `gstack-config set proactive true`. - # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl index 661e8fb01..b3e0a3402 100644 --- a/autoplan/SKILL.md.tmpl +++ b/autoplan/SKILL.md.tmpl @@ -204,7 +204,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` + File: " -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -315,7 +315,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` + Be opinionated. No hedging." -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -380,7 +380,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. CEO: Design: - File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` + File: " -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/benchmark/SKILL.md b/benchmark/SKILL.md index c6d2f72c4..d9138a034 100644 --- a/benchmark/SKILL.md +++ b/benchmark/SKILL.md @@ -3,7 +3,6 @@ name: benchmark preamble-tier: 1 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /benchmark. Performance regression detection using the browse daemon. Establishes baselines for page load times, Core Web Vitals, and resource sizes. Compares before/after on every PR. Tracks performance trends over time. @@ -100,23 +99,112 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/bin/gstack-telemetry-log b/bin/gstack-telemetry-log index 885dfc2b2..5cddc519f 100755 --- a/bin/gstack-telemetry-log +++ b/bin/gstack-telemetry-log @@ -32,21 +32,30 @@ OUTCOME="unknown" USED_BROWSE="false" SESSION_ID="" ERROR_CLASS="" +ERROR_MESSAGE="" +FAILED_STEP="" EVENT_TYPE="skill_run" +SOURCE="" while [ $# -gt 0 ]; do case "$1" in - --skill) SKILL="$2"; shift 2 ;; - --duration) DURATION="$2"; shift 2 ;; - --outcome) OUTCOME="$2"; shift 2 ;; - --used-browse) USED_BROWSE="$2"; shift 2 ;; - --session-id) SESSION_ID="$2"; shift 2 ;; - --error-class) ERROR_CLASS="$2"; shift 2 ;; - --event-type) EVENT_TYPE="$2"; shift 2 ;; + --skill) SKILL="$2"; shift 2 ;; + --duration) DURATION="$2"; shift 2 ;; + --outcome) OUTCOME="$2"; shift 2 ;; + --used-browse) USED_BROWSE="$2"; shift 2 ;; + --session-id) SESSION_ID="$2"; shift 2 ;; + --error-class) ERROR_CLASS="$2"; shift 2 ;; + --error-message) ERROR_MESSAGE="$2"; shift 2 ;; + --failed-step) FAILED_STEP="$2"; shift 2 ;; + --event-type) EVENT_TYPE="$2"; shift 2 ;; + --source) SOURCE="$2"; shift 2 ;; *) shift ;; esac done +# Source: flag > env > default 'live' +SOURCE="${SOURCE:-${GSTACK_TELEMETRY_SOURCE:-live}}" + # ─── Read telemetry tier ───────────────────────────────────── TIER="$("$CONFIG_CMD" get telemetry 2>/dev/null || true)" TIER="${TIER:-off}" @@ -146,6 +155,20 @@ mkdir -p "$ANALYTICS_DIR" ERR_FIELD="null" [ -n "$ERROR_CLASS" ] && ERR_FIELD="\"$ERROR_CLASS\"" +ERR_MSG_FIELD="null" +[ -n "$ERROR_MESSAGE" ] && ERR_MSG_FIELD="\"$(echo "$ERROR_MESSAGE" | head -c 200 | sed 's/"/\\"/g')\"" + +STEP_FIELD="null" +[ -n "$FAILED_STEP" ] && STEP_FIELD="\"$(echo "$FAILED_STEP" | head -c 30)\"" + +# Cap unreasonable durations +if [ -n "$DURATION" ] && [ "$DURATION" -gt 86400 ] 2>/dev/null; then + DURATION="" # null if > 24h +fi +if [ -n "$DURATION" ] && [ "$DURATION" -lt 0 ] 2>/dev/null; then + DURATION="" # null if negative +fi + DUR_FIELD="null" [ -n "$DURATION" ] && DUR_FIELD="$DURATION" @@ -155,10 +178,11 @@ INSTALL_FIELD="null" BROWSE_BOOL="false" [ "$USED_BROWSE" = "true" ] && BROWSE_BOOL="true" -printf '{"v":1,"ts":"%s","event_type":"%s","skill":"%s","session_id":"%s","gstack_version":"%s","os":"%s","arch":"%s","duration_s":%s,"outcome":"%s","error_class":%s,"used_browse":%s,"sessions":%s,"installation_id":%s,"_repo_slug":"%s","_branch":"%s"}\n' \ +printf '{"v":1,"ts":"%s","event_type":"%s","skill":"%s","session_id":"%s","gstack_version":"%s","os":"%s","arch":"%s","duration_s":%s,"outcome":"%s","error_class":%s,"error_message":%s,"failed_step":%s,"used_browse":%s,"sessions":%s,"installation_id":%s,"source":"%s","_repo_slug":"%s","_branch":"%s"}\n' \ "$TS" "$EVENT_TYPE" "$SKILL" "$SESSION_ID" "$GSTACK_VERSION" "$OS" "$ARCH" \ - "$DUR_FIELD" "$OUTCOME" "$ERR_FIELD" "$BROWSE_BOOL" "${SESSIONS:-1}" \ - "$INSTALL_FIELD" "$REPO_SLUG" "$BRANCH" >> "$JSONL_FILE" 2>/dev/null || true + "$DUR_FIELD" "$OUTCOME" "$ERR_FIELD" "$ERR_MSG_FIELD" "$STEP_FIELD" \ + "$BROWSE_BOOL" "${SESSIONS:-1}" \ + "$INSTALL_FIELD" "$SOURCE" "$REPO_SLUG" "$BRANCH" >> "$JSONL_FILE" 2>/dev/null || true # ─── Trigger sync if tier is not off ───────────────────────── SYNC_CMD="$GSTACK_DIR/bin/gstack-telemetry-sync" diff --git a/browse/SKILL.md b/browse/SKILL.md index af3274c32..91845a99d 100644 --- a/browse/SKILL.md +++ b/browse/SKILL.md @@ -3,7 +3,6 @@ name: browse preamble-tier: 1 version: 1.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /browse. Fast headless browser for QA testing and site dogfooding. Navigate any URL, interact with elements, verify page state, diff before/after actions, take annotated screenshots, check responsive layouts, test forms and uploads, handle dialogs, and assert element states. @@ -100,23 +99,112 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/canary/SKILL.md b/canary/SKILL.md index 26868d55c..fe889c74f 100644 --- a/canary/SKILL.md +++ b/canary/SKILL.md @@ -3,7 +3,6 @@ name: canary preamble-tier: 2 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /canary. Post-deploy canary monitoring. Watches the live app for console errors, performance regressions, and page failures using the browse daemon. Takes periodic screenshots, compares against pre-deploy baselines, and alerts @@ -107,6 +106,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,36 +114,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/careful/SKILL.md b/careful/SKILL.md index 943436534..7513b2937 100644 --- a/careful/SKILL.md +++ b/careful/SKILL.md @@ -2,7 +2,6 @@ name: careful version: 0.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /careful. Safety guardrails for destructive commands. Warns before rm -rf, DROP TABLE, force-push, git reset --hard, kubectl delete, and similar destructive operations. User can override each warning. Use when touching prod, debugging live systems, diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index 338df93b6..c0b7adb1b 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -159,7 +159,7 @@ With focus (e.g., "security"): 2. Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -244,7 +244,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -277,7 +277,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/cso/SKILL.md b/cso/SKILL.md index 47a7a0432..c023e1ebb 100644 --- a/cso/SKILL.md +++ b/cso/SKILL.md @@ -3,7 +3,6 @@ name: cso preamble-tier: 2 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /cso. Chief Security Officer mode. Infrastructure-first security audit: secrets archaeology, dependency supply chain, CI/CD pipeline security, LLM/AI security, skill supply chain scanning, plus OWASP Top 10, STRIDE threat modeling, and active verification. @@ -111,6 +110,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -118,36 +118,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/document-release/SKILL.md b/document-release/SKILL.md index 9748b2d60..1364e4d9f 100644 --- a/document-release/SKILL.md +++ b/document-release/SKILL.md @@ -3,7 +3,6 @@ name: document-release preamble-tier: 2 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /document-release. Post-ship documentation update. Reads all project docs, cross-references the diff, updates README/ARCHITECTURE/CONTRIBUTING/CLAUDE.md to match what shipped, polishes CHANGELOG voice, cleans up TODOS, and optionally bumps VERSION. Use when @@ -109,6 +108,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -116,36 +116,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/freeze/SKILL.md b/freeze/SKILL.md index 6fa53992f..00aaef613 100644 --- a/freeze/SKILL.md +++ b/freeze/SKILL.md @@ -2,7 +2,6 @@ name: freeze version: 0.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /freeze. Restrict file edits to a specific directory for the session. Blocks Edit and Write outside the allowed path. Use when debugging to prevent accidentally "fixing" unrelated code, or when you want to scope changes to one module. diff --git a/gstack-upgrade/SKILL.md b/gstack-upgrade/SKILL.md index 7f70a28b5..f97f11fb7 100644 --- a/gstack-upgrade/SKILL.md +++ b/gstack-upgrade/SKILL.md @@ -2,7 +2,6 @@ name: gstack-upgrade version: 1.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /gstack-upgrade. Upgrade gstack to the latest version. Detects global vs vendored install, runs the upgrade, and shows what's new. Use when asked to "upgrade gstack", "update gstack", or "get latest version". diff --git a/guard/SKILL.md b/guard/SKILL.md index 4758ded52..f846d38a1 100644 --- a/guard/SKILL.md +++ b/guard/SKILL.md @@ -2,7 +2,6 @@ name: guard version: 0.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /guard. Full safety mode: destructive command warnings + directory-scoped edits. Combines /careful (warns before rm -rf, DROP TABLE, force-push, etc.) with /freeze (blocks edits outside a specified directory). Use for maximum safety diff --git a/investigate/SKILL.md b/investigate/SKILL.md index 270c08290..b1df5ca29 100644 --- a/investigate/SKILL.md +++ b/investigate/SKILL.md @@ -3,7 +3,6 @@ name: investigate preamble-tier: 2 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /investigate. Systematic debugging with root cause investigation. Four phases: investigate, analyze, hypothesize, implement. Iron Law: no fixes without root cause. Use when asked to "debug this", "fix this bug", "why is this broken", @@ -123,6 +122,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -130,36 +130,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/land-and-deploy/SKILL.md b/land-and-deploy/SKILL.md index 455658fa0..85e52e4eb 100644 --- a/land-and-deploy/SKILL.md +++ b/land-and-deploy/SKILL.md @@ -3,7 +3,6 @@ name: land-and-deploy preamble-tier: 4 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /land-and-deploy. Land and deploy workflow. Merges the PR, waits for CI and deploy, verifies production health via canary checks. Takes over after /ship creates the PR. Use when: "merge", "land", "deploy", "merge and verify", @@ -106,6 +105,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -113,54 +113,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/qa-only/SKILL.md b/qa-only/SKILL.md index d1dd3ad33..6736211e3 100644 --- a/qa-only/SKILL.md +++ b/qa-only/SKILL.md @@ -3,7 +3,6 @@ name: qa-only preamble-tier: 4 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /qa-only. Report-only QA testing. Systematically tests a web application and produces a structured report with health score, screenshots, and repro steps — but never fixes anything. Use when asked to "just report bugs", "qa report only", or @@ -107,6 +106,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,54 +114,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/qa/SKILL.md b/qa/SKILL.md index b63d6fbd0..290c89af2 100644 --- a/qa/SKILL.md +++ b/qa/SKILL.md @@ -3,7 +3,6 @@ name: qa preamble-tier: 4 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /qa. Systematically QA test a web application and fix bugs found. Runs QA testing, then iteratively fixes bugs in source code, committing each fix atomically and re-verifying. Use when asked to "qa", "QA", "test this site", "find bugs", @@ -113,6 +112,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -120,54 +120,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/retro/SKILL.md b/retro/SKILL.md index 141605541..806ffde30 100644 --- a/retro/SKILL.md +++ b/retro/SKILL.md @@ -3,7 +3,6 @@ name: retro preamble-tier: 2 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /retro. Weekly engineering retrospective. Analyzes commit history, work patterns, and code quality metrics with persistent history and trend tracking. Team-aware: breaks down per-person contributions with praise and growth areas. @@ -107,6 +106,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -114,36 +114,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol @@ -676,6 +737,28 @@ Narrative covering: - If prior retro exists and has `test_health`: show delta "Test count: {last} → {now} (+{delta})" - If test ratio < 20%: flag as growth area — "100% test coverage is the goal. Tests make vibe coding safe." +### Plan Completion +Check review JSONL logs for plan completion data from /ship runs this period: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" +cat ~/.gstack/projects/$SLUG/*-reviews.jsonl 2>/dev/null | grep '"skill":"ship"' | grep '"plan_items_total"' || echo "NO_PLAN_DATA" +``` + +If plan completion data exists within the retro time window: +- Count branches shipped with plans (entries that have `plan_items_total` > 0) +- Compute average completion: sum of `plan_items_done` / sum of `plan_items_total` +- Identify most-skipped item category if data supports it + +Output: +``` +Plan Completion This Period: + {N} branches shipped with plans + Average completion: {X}% ({done}/{total} items) +``` + +If no plan data exists, skip this section silently. + ### Focus & Highlights (from Step 8) - Focus score with interpretation diff --git a/retro/SKILL.md.tmpl b/retro/SKILL.md.tmpl index 57a3759a7..dae967ef4 100644 --- a/retro/SKILL.md.tmpl +++ b/retro/SKILL.md.tmpl @@ -460,6 +460,28 @@ Narrative covering: - If prior retro exists and has `test_health`: show delta "Test count: {last} → {now} (+{delta})" - If test ratio < 20%: flag as growth area — "100% test coverage is the goal. Tests make vibe coding safe." +### Plan Completion +Check review JSONL logs for plan completion data from /ship runs this period: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" +cat ~/.gstack/projects/$SLUG/*-reviews.jsonl 2>/dev/null | grep '"skill":"ship"' | grep '"plan_items_total"' || echo "NO_PLAN_DATA" +``` + +If plan completion data exists within the retro time window: +- Count branches shipped with plans (entries that have `plan_items_total` > 0) +- Compute average completion: sum of `plan_items_done` / sum of `plan_items_total` +- Identify most-skipped item category if data supports it + +Output: +``` +Plan Completion This Period: + {N} branches shipped with plans + Average completion: {X}% ({done}/{total} items) +``` + +If no plan data exists, skip this section silently. + ### Focus & Highlights (from Step 8) - Focus score with interpretation diff --git a/review/SKILL.md.tmpl b/review/SKILL.md.tmpl index 712b91a90..bb9a3bc73 100644 --- a/review/SKILL.md.tmpl +++ b/review/SKILL.md.tmpl @@ -46,7 +46,10 @@ Before reviewing code quality, check: **did they build what was requested — no **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR. 2. Identify the **stated intent** — what was this branch supposed to accomplish? 3. Run `git diff origin/...HEAD --stat` and compare the files changed against the stated intent. -4. Evaluate with skepticism: + +{{PLAN_COMPLETION_AUDIT_REVIEW}} + +4. Evaluate with skepticism (incorporating plan completion results if available): **SCOPE CREEP detection:** - Files changed that are unrelated to the stated intent diff --git a/scripts/gen-skill-docs.ts b/scripts/gen-skill-docs.ts index 9f1db9bcb..8d483dad2 100644 --- a/scripts/gen-skill-docs.ts +++ b/scripts/gen-skill-docs.ts @@ -18,6 +18,7 @@ import type { Host, TemplateContext } from './resolvers/types'; import { HOST_PATHS } from './resolvers/types'; import { RESOLVERS } from './resolvers/index'; import { codexSkillName, transformFrontmatter, extractHookSafetyProse, extractNameAndDescription, condenseOpenAIShortDescription, generateOpenAIYaml } from './resolvers/codex-helpers'; +import { generatePlanCompletionAuditShip, generatePlanCompletionAuditReview, generatePlanVerificationExec } from './resolvers/review'; const ROOT = path.resolve(import.meta.dir, '..'); const DRY_RUN = process.argv.includes('--dry-run'); @@ -33,6 +34,2937 @@ const HOST: Host = (() => { throw new Error(`Unknown host: ${val}. Use claude, codex, or agents.`); })(); +interface HostPaths { + skillRoot: string; + localSkillRoot: string; + binDir: string; + browseDir: string; +} + +const HOST_PATHS: Record = { + claude: { + skillRoot: '~/.claude/skills/gstack', + localSkillRoot: '.claude/skills/gstack', + binDir: '~/.claude/skills/gstack/bin', + browseDir: '~/.claude/skills/gstack/browse/dist', + }, + codex: { + skillRoot: '$GSTACK_ROOT', + localSkillRoot: '.agents/skills/gstack', + binDir: '$GSTACK_BIN', + browseDir: '$GSTACK_BROWSE', + }, +}; + +interface TemplateContext { + skillName: string; + tmplPath: string; + benefitsFrom?: string[]; + host: Host; + paths: HostPaths; +} + +// ─── Shared Design Constants ──────────────────────────────── + +/** gstack's 10 AI slop anti-patterns — shared between DESIGN_METHODOLOGY and DESIGN_HARD_RULES */ +const AI_SLOP_BLACKLIST = [ + 'Purple/violet/indigo gradient backgrounds or blue-to-purple color schemes', + '**The 3-column feature grid:** icon-in-colored-circle + bold title + 2-line description, repeated 3x symmetrically. THE most recognizable AI layout.', + 'Icons in colored circles as section decoration (SaaS starter template look)', + 'Centered everything (`text-align: center` on all headings, descriptions, cards)', + 'Uniform bubbly border-radius on every element (same large radius on everything)', + 'Decorative blobs, floating circles, wavy SVG dividers (if a section feels empty, it needs better content, not decoration)', + 'Emoji as design elements (rockets in headings, emoji as bullet points)', + 'Colored left-border on cards (`border-left: 3px solid `)', + 'Generic hero copy ("Welcome to [X]", "Unlock the power of...", "Your all-in-one solution for...")', + 'Cookie-cutter section rhythm (hero → 3 features → testimonials → pricing → CTA, every section same height)', +]; + +/** OpenAI hard rejection criteria (from "Designing Delightful Frontends with GPT-5.4", Mar 2026) */ +const OPENAI_HARD_REJECTIONS = [ + 'Generic SaaS card grid as first impression', + 'Beautiful image with weak brand', + 'Strong headline with no clear action', + 'Busy imagery behind text', + 'Sections repeating same mood statement', + 'Carousel with no narrative purpose', + 'App UI made of stacked cards instead of layout', +]; + +/** OpenAI litmus checks — 7 yes/no tests for cross-model consensus scoring */ +const OPENAI_LITMUS_CHECKS = [ + 'Brand/product unmistakable in first screen?', + 'One strong visual anchor present?', + 'Page understandable by scanning headlines only?', + 'Each section has one job?', + 'Are cards actually necessary?', + 'Does motion improve hierarchy or atmosphere?', + 'Would design feel premium with all decorative shadows removed?', +]; + +// ─── Placeholder Resolvers ────────────────────────────────── + +function generateCommandReference(_ctx: TemplateContext): string { + // Group commands by category + const groups = new Map>(); + for (const [cmd, meta] of Object.entries(COMMAND_DESCRIPTIONS)) { + const list = groups.get(meta.category) || []; + list.push({ command: cmd, description: meta.description, usage: meta.usage }); + groups.set(meta.category, list); + } + + // Category display order + const categoryOrder = [ + 'Navigation', 'Reading', 'Interaction', 'Inspection', + 'Visual', 'Snapshot', 'Meta', 'Tabs', 'Server', + ]; + + const sections: string[] = []; + for (const category of categoryOrder) { + const commands = groups.get(category); + if (!commands || commands.length === 0) continue; + + // Sort alphabetically within category + commands.sort((a, b) => a.command.localeCompare(b.command)); + + sections.push(`### ${category}`); + sections.push('| Command | Description |'); + sections.push('|---------|-------------|'); + for (const cmd of commands) { + const display = cmd.usage ? `\`${cmd.usage}\`` : `\`${cmd.command}\``; + sections.push(`| ${display} | ${cmd.description} |`); + } + sections.push(''); + } + + return sections.join('\n').trimEnd(); +} + +function generateSnapshotFlags(_ctx: TemplateContext): string { + const lines: string[] = [ + 'The snapshot is your primary tool for understanding and interacting with pages.', + '', + '```', + ]; + + for (const flag of SNAPSHOT_FLAGS) { + const label = flag.valueHint ? `${flag.short} ${flag.valueHint}` : flag.short; + lines.push(`${label.padEnd(10)}${flag.long.padEnd(24)}${flag.description}`); + } + + lines.push('```'); + lines.push(''); + lines.push('All flags can be combined freely. `-o` only applies when `-a` is also used.'); + lines.push('Example: `$B snapshot -i -a -C -o /tmp/annotated.png`'); + lines.push(''); + lines.push('**Ref numbering:** @e refs are assigned sequentially (@e1, @e2, ...) in tree order.'); + lines.push('@c refs from `-C` are numbered separately (@c1, @c2, ...).'); + lines.push(''); + lines.push('After snapshot, use @refs as selectors in any command:'); + lines.push('```bash'); + lines.push('$B click @e3 $B fill @e4 "value" $B hover @e1'); + lines.push('$B html @e2 $B css @e5 "color" $B attrs @e6'); + lines.push('$B click @c1 # cursor-interactive ref (from -C)'); + lines.push('```'); + lines.push(''); + lines.push('**Output format:** indented accessibility tree with @ref IDs, one element per line.'); + lines.push('```'); + lines.push(' @e1 [heading] "Welcome" [level=1]'); + lines.push(' @e2 [textbox] "Email"'); + lines.push(' @e3 [button] "Submit"'); + lines.push('```'); + lines.push(''); + lines.push('Refs are invalidated on navigation — run `snapshot` again after `goto`.'); + + return lines.join('\n'); +} + +function generatePreambleBash(ctx: TemplateContext): string { + const runtimeRoot = ctx.host === 'codex' + ? `_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +GSTACK_ROOT="$HOME/.codex/skills/gstack" +[ -n "$_ROOT" ] && [ -d "$_ROOT/.agents/skills/gstack" ] && GSTACK_ROOT="$_ROOT/.agents/skills/gstack" +GSTACK_BIN="$GSTACK_ROOT/bin" +GSTACK_BROWSE="$GSTACK_ROOT/browse/dist" +` + : ''; + + return `## Preamble (run first) + +\`\`\`bash +${runtimeRoot}_UPD=$(${ctx.paths.binDir}/gstack-update-check 2>/dev/null || ${ctx.paths.localSkillRoot}/bin/gstack-update-check 2>/dev/null || true) +[ -n "$_UPD" ] && echo "$_UPD" || true +mkdir -p ~/.gstack/sessions +touch ~/.gstack/sessions/"$PPID" +_SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr -d ' ') +find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true +_CONTRIB=$(${ctx.paths.binDir}/gstack-config get gstack_contributor 2>/dev/null || true) +_PROACTIVE=$(${ctx.paths.binDir}/gstack-config get proactive 2>/dev/null || echo "true") +_BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") +echo "BRANCH: $_BRANCH" +echo "PROACTIVE: $_PROACTIVE" +source <(${ctx.paths.binDir}/gstack-repo-mode 2>/dev/null) || true +REPO_MODE=\${REPO_MODE:-unknown} +echo "REPO_MODE: $REPO_MODE" +_LAKE_SEEN=$([ -f ~/.gstack/.completeness-intro-seen ] && echo "yes" || echo "no") +echo "LAKE_INTRO: $_LAKE_SEEN" +_TEL=$(${ctx.paths.binDir}/gstack-config get telemetry 2>/dev/null || true) +_TEL_PROMPTED=$([ -f ~/.gstack/.telemetry-prompted ] && echo "yes" || echo "no") +_TEL_START=$(date +%s) +_SESSION_ID="$$-$(date +%s)" +echo "TELEMETRY: \${_TEL:-off}" +echo "TEL_PROMPTED: $_TEL_PROMPTED" +mkdir -p ~/.gstack/analytics +echo '{"skill":"${ctx.skillName}","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","repo":"'$(basename "$(git rev-parse --show-toplevel 2>/dev/null)" 2>/dev/null || echo "unknown")'"}' >> ~/.gstack/analytics/skill-usage.jsonl 2>/dev/null || true +# zsh-compatible: use find instead of glob to avoid NOMATCH error +for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null); do [ -f "$_PF" ] && ${ctx.paths.binDir}/gstack-telemetry-log --event-type skill_run --skill _pending_finalize --outcome unknown --session-id "$_SESSION_ID" 2>/dev/null || true; break; done +\`\`\``; +} + +function generateUpgradeCheck(ctx: TemplateContext): string { + return `If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills — only invoke +them when the user explicitly asks. The user opted out of proactive suggestions. + +If output shows \`UPGRADE_AVAILABLE \`: read \`${ctx.paths.skillRoot}/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If \`JUST_UPGRADED \`: tell user "Running gstack v{to} (just updated!)" and continue.`; +} + +function generateLakeIntro(): string { + return `If \`LAKE_INTRO\` is \`no\`: Before continuing, introduce the Completeness Principle. +Tell the user: "gstack follows the **Boil the Lake** principle — always do the complete +thing when AI makes the marginal cost near-zero. Read more: https://garryslist.org/posts/boil-the-ocean" +Then offer to open the essay in their default browser: + +\`\`\`bash +open https://garryslist.org/posts/boil-the-ocean +touch ~/.gstack/.completeness-intro-seen +\`\`\` + +Only run \`open\` if the user says yes. Always run \`touch\` to mark as seen. This only happens once.`; +} + +function generateTelemetryPrompt(ctx: TemplateContext): string { + return `If \`TEL_PROMPTED\` is \`no\` AND \`LAKE_INTRO\` is \`yes\`: After the lake intro is handled, +ask the user about telemetry. Use AskUserQuestion: + +> Help gstack get better! Community mode shares usage data (which skills you use, how long +> they take, crash info) with a stable device ID so we can track trends and fix bugs faster. +> No code, file paths, or repo names are ever sent. +> Change anytime with \`gstack-config set telemetry off\`. + +Options: +- A) Help gstack get better! (recommended) +- B) No thanks + +If A: run \`${ctx.paths.binDir}/gstack-config set telemetry community\` + +If B: ask a follow-up AskUserQuestion: + +> How about anonymous mode? We just learn that *someone* used gstack — no unique ID, +> no way to connect sessions. Just a counter that helps us know if anyone's out there. + +Options: +- A) Sure, anonymous is fine +- B) No thanks, fully off + +If B→A: run \`${ctx.paths.binDir}/gstack-config set telemetry anonymous\` +If B→B: run \`${ctx.paths.binDir}/gstack-config set telemetry off\` + +Always run: +\`\`\`bash +touch ~/.gstack/.telemetry-prompted +\`\`\` + +This only happens once. If \`TEL_PROMPTED\` is \`yes\`, skip this entirely.`; +} + +function generateAskUserFormat(_ctx: TemplateContext): string { + return `## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the \`_BRANCH\` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** \`RECOMMENDATION: Choose [X] because [one-line reason]\` — always prefer the complete option over shortcuts (see Completeness Principle). Include \`Completeness: X/10\` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: \`A) ... B) ... C) ...\` — when an option involves effort, show both scales: \`(human: ~X / CC: ~Y)\` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline.`; +} + +function generateCompletenessSection(): string { + return `## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.")`; +} + +function generateRepoModeSection(): string { + return `## Repo Ownership Mode — See Something, Say Something + +\`REPO_MODE\` from the preamble tells you who owns issues in this repo: + +- **\`solo\`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **\`collaborative\`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **\`unknown\`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication.`; +} + +function generateTestFailureTriage(): string { + return `## Test Failure Ownership Triage + +When tests fail, do NOT immediately stop. First, determine ownership: + +### Step T1: Classify each failure + +For each failing test: + +1. **Get the files changed on this branch:** + \`\`\`bash + git diff origin/...HEAD --name-only + \`\`\` + +2. **Classify the failure:** + - **In-branch** if: the failing test file itself was modified on this branch, OR the test output references code that was changed on this branch, OR you can trace the failure to a change in the branch diff. + - **Likely pre-existing** if: neither the test file nor the code it tests was modified on this branch, AND the failure is unrelated to any branch change you can identify. + - **When ambiguous, default to in-branch.** It is safer to stop the developer than to let a broken test ship. Only classify as pre-existing when you are confident. + + This classification is heuristic — use your judgment reading the diff and the test output. You do not have a programmatic dependency graph. + +### Step T2: Handle in-branch failures + +**STOP.** These are your failures. Show them and do not proceed. The developer must fix their own broken tests before shipping. + +### Step T3: Handle pre-existing failures + +Check \`REPO_MODE\` from the preamble output. + +**If REPO_MODE is \`solo\`:** + +Use AskUserQuestion: + +> These test failures appear pre-existing (not caused by your branch changes): +> +> [list each failure with file:line and brief error description] +> +> Since this is a solo repo, you're the only one who will fix these. +> +> RECOMMENDATION: Choose A — fix now while the context is fresh. Completeness: 9/10. +> A) Investigate and fix now (human: ~2-4h / CC: ~15min) — Completeness: 10/10 +> B) Add as P0 TODO — fix after this branch lands — Completeness: 7/10 +> C) Skip — I know about this, ship anyway — Completeness: 3/10 + +**If REPO_MODE is \`collaborative\` or \`unknown\`:** + +Use AskUserQuestion: + +> These test failures appear pre-existing (not caused by your branch changes): +> +> [list each failure with file:line and brief error description] +> +> This is a collaborative repo — these may be someone else's responsibility. +> +> RECOMMENDATION: Choose B — assign it to whoever broke it so the right person fixes it. Completeness: 9/10. +> A) Investigate and fix now anyway — Completeness: 10/10 +> B) Blame + assign GitHub issue to the author — Completeness: 9/10 +> C) Add as P0 TODO — Completeness: 7/10 +> D) Skip — ship anyway — Completeness: 3/10 + +### Step T4: Execute the chosen action + +**If "Investigate and fix now":** +- Switch to /investigate mindset: root cause first, then minimal fix. +- Fix the pre-existing failure. +- Commit the fix separately from the branch's changes: \`git commit -m "fix: pre-existing test failure in "\` +- Continue with the workflow. + +**If "Add as P0 TODO":** +- If \`TODOS.md\` exists, add the entry following the format in \`review/TODOS-format.md\` (or \`.claude/skills/review/TODOS-format.md\`). +- If \`TODOS.md\` does not exist, create it with the standard header and add the entry. +- Entry should include: title, the error output, which branch it was noticed on, and priority P0. +- Continue with the workflow — treat the pre-existing failure as non-blocking. + +**If "Blame + assign GitHub issue" (collaborative only):** +- Find who likely broke it. Check BOTH the test file AND the production code it tests: + \`\`\`bash + # Who last touched the failing test? + git log --format="%an (%ae)" -1 -- + # Who last touched the production code the test covers? (often the actual breaker) + git log --format="%an (%ae)" -1 -- + \`\`\` + If these are different people, prefer the production code author — they likely introduced the regression. +- Create a GitHub issue assigned to that person: + \`\`\`bash + gh issue create \\ + --title "Pre-existing test failure: " \\ + --body "Found failing on branch . Failure is pre-existing.\\n\\n**Error:**\\n\`\`\`\\n\\n\`\`\`\\n\\n**Last modified by:** \\n**Noticed by:** gstack /ship on " \\ + --assignee "" + \`\`\` +- If \`gh\` is not available or \`--assignee\` fails (user not in org, etc.), create the issue without assignee and note who should look at it in the body. +- Continue with the workflow. + +**If "Skip":** +- Continue with the workflow. +- Note in output: "Pre-existing test failure skipped: "`; +} + +function generateSearchBeforeBuildingSection(ctx: TemplateContext): string { + return `## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read \`${ctx.paths.skillRoot}/ETHOS.md\` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +\`\`\`bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +\`\`\` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only."`; +} + +function generateContributorMode(): string { + return `## Contributor Mode + +If \`_CONTRIB\` is \`true\`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, \`$B js "await fetch(...)"\` used to fail with \`SyntaxError: await is only valid in async functions\` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write \`~/.gstack/contributor-logs/{slug}.md\` with **all sections below** (do not truncate — include every section through the Date/Version footer): + +\`\`\` +# {Title} + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce +1. {step} + +## Raw output +\`\`\` +{paste the actual error or unexpected output here} +\`\`\` + +## What would make this a 10 +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} +\`\`\` + +Slug: lowercase, hyphens, max 60 chars (e.g. \`browse-js-no-await\`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}"`; +} + +function generateCompletionStatus(): string { + return `## Completion Status Protocol + +When completing a skill workflow, report status using one of: +- **DONE** — All steps completed successfully. Evidence provided for each claim. +- **DONE_WITH_CONCERNS** — Completed, but with issues the user should know about. List each concern. +- **BLOCKED** — Cannot proceed. State what is blocking and what was tried. +- **NEEDS_CONTEXT** — Missing information required to continue. State exactly what you need. + +### Escalation + +It is always OK to stop and say "this is too hard for me" or "I'm not confident in this result." + +Bad work is worse than no work. You will not be penalized for escalating. +- If you have attempted a task 3 times without success, STOP and escalate. +- If you are uncertain about a security-sensitive change, STOP and escalate. +- If the scope of work exceeds what you can verify, STOP and escalate. + +Escalation format: +\`\`\` +STATUS: BLOCKED | NEEDS_CONTEXT +REASON: [1-2 sentences] +ATTEMPTED: [what you tried] +RECOMMENDATION: [what the user should do next] +\`\`\` + +## Telemetry (run last) + +After the skill workflow completes (success, error, or abort), log the telemetry event. +Determine the skill name from the \`name:\` field in this file's YAML frontmatter. +Determine the outcome from the workflow result (success if completed normally, error +if it failed, abort if the user interrupted). + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This command writes telemetry to +\`~/.gstack/analytics/\` (user config directory, not project files). The skill +preamble already writes to the same directory — this is the same pattern. +Skipping this command loses session duration and outcome data. + +Run this bash: + +\`\`\`bash +_TEL_END=$(date +%s) +_TEL_DUR=$(( _TEL_END - _TEL_START )) +rm -f ~/.gstack/analytics/.pending-"$_SESSION_ID" 2>/dev/null || true +~/.claude/skills/gstack/bin/gstack-telemetry-log \\ + --skill "SKILL_NAME" --duration "$_TEL_DUR" --outcome "OUTCOME" \\ + --used-browse "USED_BROWSE" --session-id "$_SESSION_ID" 2>/dev/null & +\`\`\` + +Replace \`SKILL_NAME\` with the actual skill name from frontmatter, \`OUTCOME\` with +success/error/abort, and \`USED_BROWSE\` with true/false based on whether \`$B\` was used. +If you cannot determine the outcome, use "unknown". This runs in the background and +never blocks the user. + +## Plan Status Footer + +When you are in plan mode and about to call ExitPlanMode: + +1. Check if the plan file already has a \`## GSTACK REVIEW REPORT\` section. +2. If it DOES — skip (a review skill already wrote a richer report). +3. If it does NOT — run this command: + +\\\`\\\`\\\`bash +~/.claude/skills/gstack/bin/gstack-review-read +\\\`\\\`\\\` + +Then write a \`## GSTACK REVIEW REPORT\` section to the end of the plan file: + +- If the output contains review entries (JSONL lines before \`---CONFIG---\`): format the + standard report table with runs/status/findings per skill, same format as the review + skills use. +- If the output is \`NO_REVIEWS\` or empty: write this placeholder table: + +\\\`\\\`\\\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \\\`/plan-ceo-review\\\` | Scope & strategy | 0 | — | — | +| Codex Review | \\\`/codex review\\\` | Independent 2nd opinion | 0 | — | — | +| Eng Review | \\\`/plan-eng-review\\\` | Architecture & tests (required) | 0 | — | — | +| Design Review | \\\`/plan-design-review\\\` | UI/UX gaps | 0 | — | — | + +**VERDICT:** NO REVIEWS YET — run \\\`/autoplan\\\` for full review pipeline, or individual reviews above. +\\\`\\\`\\\` + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status.`; +} + +function generatePreamble(ctx: TemplateContext): string { + return [ + generatePreambleBash(ctx), + generateUpgradeCheck(ctx), + generateLakeIntro(), + generateTelemetryPrompt(ctx), + generateAskUserFormat(ctx), + generateCompletenessSection(), + generateRepoModeSection(), + generateSearchBeforeBuildingSection(ctx), + generateContributorMode(), + generateCompletionStatus(), + ].join('\n\n'); +} + +function generateBrowseSetup(ctx: TemplateContext): string { + return `## SETUP (run this check BEFORE any browse command) + +\`\`\`bash +_ROOT=$(git rev-parse --show-toplevel 2>/dev/null) +B="" +[ -n "$_ROOT" ] && [ -x "$_ROOT/${ctx.paths.localSkillRoot}/browse/dist/browse" ] && B="$_ROOT/${ctx.paths.localSkillRoot}/browse/dist/browse" +[ -z "$B" ] && B=${ctx.paths.browseDir}/browse +if [ -x "$B" ]; then + echo "READY: $B" +else + echo "NEEDS_SETUP" +fi +\`\`\` + +If \`NEEDS_SETUP\`: +1. Tell the user: "gstack browse needs a one-time build (~10 seconds). OK to proceed?" Then STOP and wait. +2. Run: \`cd && ./setup\` +3. If \`bun\` is not installed: \`curl -fsSL https://bun.sh/install | bash\``; +} + +function generateBaseBranchDetect(_ctx: TemplateContext): string { + return `## Step 0: Detect base branch + +Determine which branch this PR targets. Use the result as "the base branch" in all subsequent steps. + +1. Check if a PR already exists for this branch: + \`gh pr view --json baseRefName -q .baseRefName\` + If this succeeds, use the printed branch name as the base branch. + +2. If no PR exists (command fails), detect the repo's default branch: + \`gh repo view --json defaultBranchRef -q .defaultBranchRef.name\` + +3. If both commands fail, fall back to \`main\`. + +Print the detected base branch name. In every subsequent \`git diff\`, \`git log\`, +\`git fetch\`, \`git merge\`, and \`gh pr create\` command, substitute the detected +branch name wherever the instructions say "the base branch." + +---`; +} + +function generateQAMethodology(_ctx: TemplateContext): string { + return `## Modes + +### Diff-aware (automatic when on a feature branch with no URL) + +This is the **primary mode** for developers verifying their work. When the user says \`/qa\` without a URL and the repo is on a feature branch, automatically: + +1. **Analyze the branch diff** to understand what changed: + \`\`\`bash + git diff main...HEAD --name-only + git log main..HEAD --oneline + \`\`\` + +2. **Identify affected pages/routes** from the changed files: + - Controller/route files → which URL paths they serve + - View/template/component files → which pages render them + - Model/service files → which pages use those models (check controllers that reference them) + - CSS/style files → which pages include those stylesheets + - API endpoints → test them directly with \`$B js "await fetch('/api/...')"\` + - Static pages (markdown, HTML) → navigate to them directly + + **If no obvious pages/routes are identified from the diff:** Do not skip browser testing. The user invoked /qa because they want browser-based verification. Fall back to Quick mode — navigate to the homepage, follow the top 5 navigation targets, check console for errors, and test any interactive elements found. Backend, config, and infrastructure changes affect app behavior — always verify the app still works. + +3. **Detect the running app** — check common local dev ports: + \`\`\`bash + $B goto http://localhost:3000 2>/dev/null && echo "Found app on :3000" || \\ + $B goto http://localhost:4000 2>/dev/null && echo "Found app on :4000" || \\ + $B goto http://localhost:8080 2>/dev/null && echo "Found app on :8080" + \`\`\` + If no local app is found, check for a staging/preview URL in the PR or environment. If nothing works, ask the user for the URL. + +4. **Test each affected page/route:** + - Navigate to the page + - Take a screenshot + - Check console for errors + - If the change was interactive (forms, buttons, flows), test the interaction end-to-end + - Use \`snapshot -D\` before and after actions to verify the change had the expected effect + +5. **Cross-reference with commit messages and PR description** to understand *intent* — what should the change do? Verify it actually does that. + +6. **Check TODOS.md** (if it exists) for known bugs or issues related to the changed files. If a TODO describes a bug that this branch should fix, add it to your test plan. If you find a new bug during QA that isn't in TODOS.md, note it in the report. + +7. **Report findings** scoped to the branch changes: + - "Changes tested: N pages/routes affected by this branch" + - For each: does it work? Screenshot evidence. + - Any regressions on adjacent pages? + +**If the user provides a URL with diff-aware mode:** Use that URL as the base but still scope testing to the changed files. + +### Full (default when URL is provided) +Systematic exploration. Visit every reachable page. Document 5-10 well-evidenced issues. Produce health score. Takes 5-15 minutes depending on app size. + +### Quick (\`--quick\`) +30-second smoke test. Visit homepage + top 5 navigation targets. Check: page loads? Console errors? Broken links? Produce health score. No detailed issue documentation. + +### Regression (\`--regression \`) +Run full mode, then load \`baseline.json\` from a previous run. Diff: which issues are fixed? Which are new? What's the score delta? Append regression section to report. + +--- + +## Workflow + +### Phase 1: Initialize + +1. Find browse binary (see Setup above) +2. Create output directories +3. Copy report template from \`qa/templates/qa-report-template.md\` to output dir +4. Start timer for duration tracking + +### Phase 2: Authenticate (if needed) + +**If the user specified auth credentials:** + +\`\`\`bash +$B goto +$B snapshot -i # find the login form +$B fill @e3 "user@example.com" +$B fill @e4 "[REDACTED]" # NEVER include real passwords in report +$B click @e5 # submit +$B snapshot -D # verify login succeeded +\`\`\` + +**If the user provided a cookie file:** + +\`\`\`bash +$B cookie-import cookies.json +$B goto +\`\`\` + +**If 2FA/OTP is required:** Ask the user for the code and wait. + +**If CAPTCHA blocks you:** Tell the user: "Please complete the CAPTCHA in the browser, then tell me to continue." + +### Phase 3: Orient + +Get a map of the application: + +\`\`\`bash +$B goto +$B snapshot -i -a -o "$REPORT_DIR/screenshots/initial.png" +$B links # map navigation structure +$B console --errors # any errors on landing? +\`\`\` + +**Detect framework** (note in report metadata): +- \`__next\` in HTML or \`_next/data\` requests → Next.js +- \`csrf-token\` meta tag → Rails +- \`wp-content\` in URLs → WordPress +- Client-side routing with no page reloads → SPA + +**For SPAs:** The \`links\` command may return few results because navigation is client-side. Use \`snapshot -i\` to find nav elements (buttons, menu items) instead. + +### Phase 4: Explore + +Visit pages systematically. At each page: + +\`\`\`bash +$B goto +$B snapshot -i -a -o "$REPORT_DIR/screenshots/page-name.png" +$B console --errors +\`\`\` + +Then follow the **per-page exploration checklist** (see \`qa/references/issue-taxonomy.md\`): + +1. **Visual scan** — Look at the annotated screenshot for layout issues +2. **Interactive elements** — Click buttons, links, controls. Do they work? +3. **Forms** — Fill and submit. Test empty, invalid, edge cases +4. **Navigation** — Check all paths in and out +5. **States** — Empty state, loading, error, overflow +6. **Console** — Any new JS errors after interactions? +7. **Responsiveness** — Check mobile viewport if relevant: + \`\`\`bash + $B viewport 375x812 + $B screenshot "$REPORT_DIR/screenshots/page-mobile.png" + $B viewport 1280x720 + \`\`\` + +**Depth judgment:** Spend more time on core features (homepage, dashboard, checkout, search) and less on secondary pages (about, terms, privacy). + +**Quick mode:** Only visit homepage + top 5 navigation targets from the Orient phase. Skip the per-page checklist — just check: loads? Console errors? Broken links visible? + +### Phase 5: Document + +Document each issue **immediately when found** — don't batch them. + +**Two evidence tiers:** + +**Interactive bugs** (broken flows, dead buttons, form failures): +1. Take a screenshot before the action +2. Perform the action +3. Take a screenshot showing the result +4. Use \`snapshot -D\` to show what changed +5. Write repro steps referencing screenshots + +\`\`\`bash +$B screenshot "$REPORT_DIR/screenshots/issue-001-step-1.png" +$B click @e5 +$B screenshot "$REPORT_DIR/screenshots/issue-001-result.png" +$B snapshot -D +\`\`\` + +**Static bugs** (typos, layout issues, missing images): +1. Take a single annotated screenshot showing the problem +2. Describe what's wrong + +\`\`\`bash +$B snapshot -i -a -o "$REPORT_DIR/screenshots/issue-002.png" +\`\`\` + +**Write each issue to the report immediately** using the template format from \`qa/templates/qa-report-template.md\`. + +### Phase 6: Wrap Up + +1. **Compute health score** using the rubric below +2. **Write "Top 3 Things to Fix"** — the 3 highest-severity issues +3. **Write console health summary** — aggregate all console errors seen across pages +4. **Update severity counts** in the summary table +5. **Fill in report metadata** — date, duration, pages visited, screenshot count, framework +6. **Save baseline** — write \`baseline.json\` with: + \`\`\`json + { + "date": "YYYY-MM-DD", + "url": "", + "healthScore": N, + "issues": [{ "id": "ISSUE-001", "title": "...", "severity": "...", "category": "..." }], + "categoryScores": { "console": N, "links": N, ... } + } + \`\`\` + +**Regression mode:** After writing the report, load the baseline file. Compare: +- Health score delta +- Issues fixed (in baseline but not current) +- New issues (in current but not baseline) +- Append the regression section to the report + +--- + +## Health Score Rubric + +Compute each category score (0-100), then take the weighted average. + +### Console (weight: 15%) +- 0 errors → 100 +- 1-3 errors → 70 +- 4-10 errors → 40 +- 10+ errors → 10 + +### Links (weight: 10%) +- 0 broken → 100 +- Each broken link → -15 (minimum 0) + +### Per-Category Scoring (Visual, Functional, UX, Content, Performance, Accessibility) +Each category starts at 100. Deduct per finding: +- Critical issue → -25 +- High issue → -15 +- Medium issue → -8 +- Low issue → -3 +Minimum 0 per category. + +### Weights +| Category | Weight | +|----------|--------| +| Console | 15% | +| Links | 10% | +| Visual | 10% | +| Functional | 20% | +| UX | 15% | +| Performance | 10% | +| Content | 5% | +| Accessibility | 15% | + +### Final Score +\`score = Σ (category_score × weight)\` + +--- + +## Framework-Specific Guidance + +### Next.js +- Check console for hydration errors (\`Hydration failed\`, \`Text content did not match\`) +- Monitor \`_next/data\` requests in network — 404s indicate broken data fetching +- Test client-side navigation (click links, don't just \`goto\`) — catches routing issues +- Check for CLS (Cumulative Layout Shift) on pages with dynamic content + +### Rails +- Check for N+1 query warnings in console (if development mode) +- Verify CSRF token presence in forms +- Test Turbo/Stimulus integration — do page transitions work smoothly? +- Check for flash messages appearing and dismissing correctly + +### WordPress +- Check for plugin conflicts (JS errors from different plugins) +- Verify admin bar visibility for logged-in users +- Test REST API endpoints (\`/wp-json/\`) +- Check for mixed content warnings (common with WP) + +### General SPA (React, Vue, Angular) +- Use \`snapshot -i\` for navigation — \`links\` command misses client-side routes +- Check for stale state (navigate away and back — does data refresh?) +- Test browser back/forward — does the app handle history correctly? +- Check for memory leaks (monitor console after extended use) + +--- + +## Important Rules + +1. **Repro is everything.** Every issue needs at least one screenshot. No exceptions. +2. **Verify before documenting.** Retry the issue once to confirm it's reproducible, not a fluke. +3. **Never include credentials.** Write \`[REDACTED]\` for passwords in repro steps. +4. **Write incrementally.** Append each issue to the report as you find it. Don't batch. +5. **Never read source code.** Test as a user, not a developer. +6. **Check console after every interaction.** JS errors that don't surface visually are still bugs. +7. **Test like a user.** Use realistic data. Walk through complete workflows end-to-end. +8. **Depth over breadth.** 5-10 well-documented issues with evidence > 20 vague descriptions. +9. **Never delete output files.** Screenshots and reports accumulate — that's intentional. +10. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses. +11. **Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user. +12. **Never refuse to use the browser.** When the user invokes /qa or /qa-only, they are requesting browser-based testing. Never suggest evals, unit tests, or other alternatives as a substitute. Even if the diff appears to have no UI changes, backend changes affect app behavior — always open the browser and test.`; +} + +function generateDesignReviewLite(ctx: TemplateContext): string { + const litmusList = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join(' '); + const rejectionList = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join(' '); + // Codex block only for Claude host + const codexBlock = ctx.host === 'codex' ? '' : ` + +7. **Codex design voice** (optional, automatic if available): + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If Codex is available, run a lightweight design check on the diff: + +\`\`\`bash +TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +\`\`\` + +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_DRL" && rm -f "$TMPERR_DRL" +\`\`\` + +**Error handling:** All errors are non-blocking. On auth failure, timeout, or empty response — skip with a brief note and continue. + +Present Codex output under a \`CODEX (design):\` header, merged with the checklist findings above.`; + + return `## Design Review (conditional, diff-scoped) + +Check if the diff touches frontend files using \`gstack-diff-scope\`: + +\`\`\`bash +source <(${ctx.paths.binDir}/gstack-diff-scope 2>/dev/null) +\`\`\` + +**If \`SCOPE_FRONTEND=false\`:** Skip design review silently. No output. + +**If \`SCOPE_FRONTEND=true\`:** + +1. **Check for DESIGN.md.** If \`DESIGN.md\` or \`design-system.md\` exists in the repo root, read it. All design findings are calibrated against it — patterns blessed in DESIGN.md are not flagged. If not found, use universal design principles. + +2. **Read \`.claude/skills/review/design-checklist.md\`.** If the file cannot be read, skip design review with a note: "Design checklist not found — skipping design review." + +3. **Read each changed frontend file** (full file, not just diff hunks). Frontend files are identified by the patterns listed in the checklist. + +4. **Apply the design checklist** against the changed files. For each item: + - **[HIGH] mechanical CSS fix** (\`outline: none\`, \`!important\`, \`font-size < 16px\`): classify as AUTO-FIX + - **[HIGH/MEDIUM] design judgment needed**: classify as ASK + - **[LOW] intent-based detection**: present as "Possible — verify visually or run /design-review" + +5. **Include findings** in the review output under a "Design Review" header, following the output format in the checklist. Design findings merge with code review findings into the same Fix-First flow. + +6. **Log the result** for the Review Readiness Dashboard: + +\`\`\`bash +${ctx.paths.binDir}/gstack-review-log '{"skill":"design-review-lite","timestamp":"TIMESTAMP","status":"STATUS","findings":N,"auto_fixed":M,"commit":"COMMIT"}' +\`\`\` + +Substitute: TIMESTAMP = ISO 8601 datetime, STATUS = "clean" if 0 findings or "issues_found", N = total findings, M = auto-fixed count, COMMIT = output of \`git rev-parse --short HEAD\`.${codexBlock}`; +} + +// NOTE: design-checklist.md is a subset of this methodology for code-level detection. +// When adding items here, also update review/design-checklist.md, and vice versa. +function generateDesignMethodology(_ctx: TemplateContext): string { + return `## Modes + +### Full (default) +Systematic review of all pages reachable from homepage. Visit 5-8 pages. Full checklist evaluation, responsive screenshots, interaction flow testing. Produces complete design audit report with letter grades. + +### Quick (\`--quick\`) +Homepage + 2 key pages only. First Impression + Design System Extraction + abbreviated checklist. Fastest path to a design score. + +### Deep (\`--deep\`) +Comprehensive review: 10-15 pages, every interaction flow, exhaustive checklist. For pre-launch audits or major redesigns. + +### Diff-aware (automatic when on a feature branch with no URL) +When on a feature branch, scope to pages affected by the branch changes: +1. Analyze the branch diff: \`git diff main...HEAD --name-only\` +2. Map changed files to affected pages/routes +3. Detect running app on common local ports (3000, 4000, 8080) +4. Audit only affected pages, compare design quality before/after + +### Regression (\`--regression\` or previous \`design-baseline.json\` found) +Run full audit, then load previous \`design-baseline.json\`. Compare: per-category grade deltas, new findings, resolved findings. Output regression table in report. + +--- + +## Phase 1: First Impression + +The most uniquely designer-like output. Form a gut reaction before analyzing anything. + +1. Navigate to the target URL +2. Take a full-page desktop screenshot: \`$B screenshot "$REPORT_DIR/screenshots/first-impression.png"\` +3. Write the **First Impression** using this structured critique format: + - "The site communicates **[what]**." (what it says at a glance — competence? playfulness? confusion?) + - "I notice **[observation]**." (what stands out, positive or negative — be specific) + - "The first 3 things my eye goes to are: **[1]**, **[2]**, **[3]**." (hierarchy check — are these intentional?) + - "If I had to describe this in one word: **[word]**." (gut verdict) + +This is the section users read first. Be opinionated. A designer doesn't hedge — they react. + +--- + +## Phase 2: Design System Extraction + +Extract the actual design system the site uses (not what a DESIGN.md says, but what's rendered): + +\`\`\`bash +# Fonts in use (capped at 500 elements to avoid timeout) +$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).map(e => getComputedStyle(e).fontFamily))])" + +# Color palette in use +$B js "JSON.stringify([...new Set([...document.querySelectorAll('*')].slice(0,500).flatMap(e => [getComputedStyle(e).color, getComputedStyle(e).backgroundColor]).filter(c => c !== 'rgba(0, 0, 0, 0)'))])" + +# Heading hierarchy +$B js "JSON.stringify([...document.querySelectorAll('h1,h2,h3,h4,h5,h6')].map(h => ({tag:h.tagName, text:h.textContent.trim().slice(0,50), size:getComputedStyle(h).fontSize, weight:getComputedStyle(h).fontWeight})))" + +# Touch target audit (find undersized interactive elements) +$B js "JSON.stringify([...document.querySelectorAll('a,button,input,[role=button]')].filter(e => {const r=e.getBoundingClientRect(); return r.width>0 && (r.width<44||r.height<44)}).map(e => ({tag:e.tagName, text:(e.textContent||'').trim().slice(0,30), w:Math.round(e.getBoundingClientRect().width), h:Math.round(e.getBoundingClientRect().height)})).slice(0,20))" + +# Performance baseline +$B perf +\`\`\` + +Structure findings as an **Inferred Design System**: +- **Fonts:** list with usage counts. Flag if >3 distinct font families. +- **Colors:** palette extracted. Flag if >12 unique non-gray colors. Note warm/cool/mixed. +- **Heading Scale:** h1-h6 sizes. Flag skipped levels, non-systematic size jumps. +- **Spacing Patterns:** sample padding/margin values. Flag non-scale values. + +After extraction, offer: *"Want me to save this as your DESIGN.md? I can lock in these observations as your project's design system baseline."* + +--- + +## Phase 3: Page-by-Page Visual Audit + +For each page in scope: + +\`\`\`bash +$B goto +$B snapshot -i -a -o "$REPORT_DIR/screenshots/{page}-annotated.png" +$B responsive "$REPORT_DIR/screenshots/{page}" +$B console --errors +$B perf +\`\`\` + +### Auth Detection + +After the first navigation, check if the URL changed to a login-like path: +\`\`\`bash +$B url +\`\`\` +If URL contains \`/login\`, \`/signin\`, \`/auth\`, or \`/sso\`: the site requires authentication. AskUserQuestion: "This site requires authentication. Want to import cookies from your browser? Run \`/setup-browser-cookies\` first if needed." + +### Design Audit Checklist (10 categories, ~80 items) + +Apply these at each page. Each finding gets an impact rating (high/medium/polish) and category. + +**1. Visual Hierarchy & Composition** (8 items) +- Clear focal point? One primary CTA per view? +- Eye flows naturally top-left to bottom-right? +- Visual noise — competing elements fighting for attention? +- Information density appropriate for content type? +- Z-index clarity — nothing unexpectedly overlapping? +- Above-the-fold content communicates purpose in 3 seconds? +- Squint test: hierarchy still visible when blurred? +- White space is intentional, not leftover? + +**2. Typography** (15 items) +- Font count <=3 (flag if more) +- Scale follows ratio (1.25 major third or 1.333 perfect fourth) +- Line-height: 1.5x body, 1.15-1.25x headings +- Measure: 45-75 chars per line (66 ideal) +- Heading hierarchy: no skipped levels (h1→h3 without h2) +- Weight contrast: >=2 weights used for hierarchy +- No blacklisted fonts (Papyrus, Comic Sans, Lobster, Impact, Jokerman) +- If primary font is Inter/Roboto/Open Sans/Poppins → flag as potentially generic +- \`text-wrap: balance\` or \`text-pretty\` on headings (check via \`$B css text-wrap\`) +- Curly quotes used, not straight quotes +- Ellipsis character (\`…\`) not three dots (\`...\`) +- \`font-variant-numeric: tabular-nums\` on number columns +- Body text >= 16px +- Caption/label >= 12px +- No letterspacing on lowercase text + +**3. Color & Contrast** (10 items) +- Palette coherent (<=12 unique non-gray colors) +- WCAG AA: body text 4.5:1, large text (18px+) 3:1, UI components 3:1 +- Semantic colors consistent (success=green, error=red, warning=yellow/amber) +- No color-only encoding (always add labels, icons, or patterns) +- Dark mode: surfaces use elevation, not just lightness inversion +- Dark mode: text off-white (~#E0E0E0), not pure white +- Primary accent desaturated 10-20% in dark mode +- \`color-scheme: dark\` on html element (if dark mode present) +- No red/green only combinations (8% of men have red-green deficiency) +- Neutral palette is warm or cool consistently — not mixed + +**4. Spacing & Layout** (12 items) +- Grid consistent at all breakpoints +- Spacing uses a scale (4px or 8px base), not arbitrary values +- Alignment is consistent — nothing floats outside the grid +- Rhythm: related items closer together, distinct sections further apart +- Border-radius hierarchy (not uniform bubbly radius on everything) +- Inner radius = outer radius - gap (nested elements) +- No horizontal scroll on mobile +- Max content width set (no full-bleed body text) +- \`env(safe-area-inset-*)\` for notch devices +- URL reflects state (filters, tabs, pagination in query params) +- Flex/grid used for layout (not JS measurement) +- Breakpoints: mobile (375), tablet (768), desktop (1024), wide (1440) + +**5. Interaction States** (10 items) +- Hover state on all interactive elements +- \`focus-visible\` ring present (never \`outline: none\` without replacement) +- Active/pressed state with depth effect or color shift +- Disabled state: reduced opacity + \`cursor: not-allowed\` +- Loading: skeleton shapes match real content layout +- Empty states: warm message + primary action + visual (not just "No items.") +- Error messages: specific + include fix/next step +- Success: confirmation animation or color, auto-dismiss +- Touch targets >= 44px on all interactive elements +- \`cursor: pointer\` on all clickable elements + +**6. Responsive Design** (8 items) +- Mobile layout makes *design* sense (not just stacked desktop columns) +- Touch targets sufficient on mobile (>= 44px) +- No horizontal scroll on any viewport +- Images handle responsive (srcset, sizes, or CSS containment) +- Text readable without zooming on mobile (>= 16px body) +- Navigation collapses appropriately (hamburger, bottom nav, etc.) +- Forms usable on mobile (correct input types, no autoFocus on mobile) +- No \`user-scalable=no\` or \`maximum-scale=1\` in viewport meta + +**7. Motion & Animation** (6 items) +- Easing: ease-out for entering, ease-in for exiting, ease-in-out for moving +- Duration: 50-700ms range (nothing slower unless page transition) +- Purpose: every animation communicates something (state change, attention, spatial relationship) +- \`prefers-reduced-motion\` respected (check: \`$B js "matchMedia('(prefers-reduced-motion: reduce)').matches"\`) +- No \`transition: all\` — properties listed explicitly +- Only \`transform\` and \`opacity\` animated (not layout properties like width, height, top, left) + +**8. Content & Microcopy** (8 items) +- Empty states designed with warmth (message + action + illustration/icon) +- Error messages specific: what happened + why + what to do next +- Button labels specific ("Save API Key" not "Continue" or "Submit") +- No placeholder/lorem ipsum text visible in production +- Truncation handled (\`text-overflow: ellipsis\`, \`line-clamp\`, or \`break-words\`) +- Active voice ("Install the CLI" not "The CLI will be installed") +- Loading states end with \`…\` ("Saving…" not "Saving...") +- Destructive actions have confirmation modal or undo window + +**9. AI Slop Detection** (10 anti-patterns — the blacklist) + +The test: would a human designer at a respected studio ever ship this? + +${AI_SLOP_BLACKLIST.map(item => `- ${item}`).join('\n')} + +**10. Performance as Design** (6 items) +- LCP < 2.0s (web apps), < 1.5s (informational sites) +- CLS < 0.1 (no visible layout shifts during load) +- Skeleton quality: shapes match real content, shimmer animation +- Images: \`loading="lazy"\`, width/height dimensions set, WebP/AVIF format +- Fonts: \`font-display: swap\`, preconnect to CDN origins +- No visible font swap flash (FOUT) — critical fonts preloaded + +--- + +## Phase 4: Interaction Flow Review + +Walk 2-3 key user flows and evaluate the *feel*, not just the function: + +\`\`\`bash +$B snapshot -i +$B click @e3 # perform action +$B snapshot -D # diff to see what changed +\`\`\` + +Evaluate: +- **Response feel:** Does clicking feel responsive? Any delays or missing loading states? +- **Transition quality:** Are transitions intentional or generic/absent? +- **Feedback clarity:** Did the action clearly succeed or fail? Is the feedback immediate? +- **Form polish:** Focus states visible? Validation timing correct? Errors near the source? + +--- + +## Phase 5: Cross-Page Consistency + +Compare screenshots and observations across pages for: +- Navigation bar consistent across all pages? +- Footer consistent? +- Component reuse vs one-off designs (same button styled differently on different pages?) +- Tone consistency (one page playful while another is corporate?) +- Spacing rhythm carries across pages? + +--- + +## Phase 6: Compile Report + +### Output Locations + +**Local:** \`.gstack/design-reports/design-audit-{domain}-{YYYY-MM-DD}.md\` + +**Project-scoped:** +\`\`\`bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +\`\`\` +Write to: \`~/.gstack/projects/{slug}/{user}-{branch}-design-audit-{datetime}.md\` + +**Baseline:** Write \`design-baseline.json\` for regression mode: +\`\`\`json +{ + "date": "YYYY-MM-DD", + "url": "", + "designScore": "B", + "aiSlopScore": "C", + "categoryGrades": { "hierarchy": "A", "typography": "B", ... }, + "findings": [{ "id": "FINDING-001", "title": "...", "impact": "high", "category": "typography" }] +} +\`\`\` + +### Scoring System + +**Dual headline scores:** +- **Design Score: {A-F}** — weighted average of all 10 categories +- **AI Slop Score: {A-F}** — standalone grade with pithy verdict + +**Per-category grades:** +- **A:** Intentional, polished, delightful. Shows design thinking. +- **B:** Solid fundamentals, minor inconsistencies. Looks professional. +- **C:** Functional but generic. No major problems, no design point of view. +- **D:** Noticeable problems. Feels unfinished or careless. +- **F:** Actively hurting user experience. Needs significant rework. + +**Grade computation:** Each category starts at A. Each High-impact finding drops one letter grade. Each Medium-impact finding drops half a letter grade. Polish findings are noted but do not affect grade. Minimum is F. + +**Category weights for Design Score:** +| Category | Weight | +|----------|--------| +| Visual Hierarchy | 15% | +| Typography | 15% | +| Spacing & Layout | 15% | +| Color & Contrast | 10% | +| Interaction States | 10% | +| Responsive | 10% | +| Content Quality | 10% | +| AI Slop | 5% | +| Motion | 5% | +| Performance Feel | 5% | + +AI Slop is 5% of Design Score but also graded independently as a headline metric. + +### Regression Output + +When previous \`design-baseline.json\` exists or \`--regression\` flag is used: +- Load baseline grades +- Compare: per-category deltas, new findings, resolved findings +- Append regression table to report + +--- + +## Design Critique Format + +Use structured feedback, not opinions: +- "I notice..." — observation (e.g., "I notice the primary CTA competes with the secondary action") +- "I wonder..." — question (e.g., "I wonder if users will understand what 'Process' means here") +- "What if..." — suggestion (e.g., "What if we moved search to a more prominent position?") +- "I think... because..." — reasoned opinion (e.g., "I think the spacing between sections is too uniform because it doesn't create hierarchy") + +Tie everything to user goals and product objectives. Always suggest specific improvements alongside problems. + +--- + +## Important Rules + +1. **Think like a designer, not a QA engineer.** You care whether things feel right, look intentional, and respect the user. You do NOT just care whether things "work." +2. **Screenshots are evidence.** Every finding needs at least one screenshot. Use annotated screenshots (\`snapshot -a\`) to highlight elements. +3. **Be specific and actionable.** "Change X to Y because Z" — not "the spacing feels off." +4. **Never read source code.** Evaluate the rendered site, not the implementation. (Exception: offer to write DESIGN.md from extracted observations.) +5. **AI Slop detection is your superpower.** Most developers can't evaluate whether their site looks AI-generated. You can. Be direct about it. +6. **Quick wins matter.** Always include a "Quick Wins" section — the 3-5 highest-impact fixes that take <30 minutes each. +7. **Use \`snapshot -C\` for tricky UIs.** Finds clickable divs that the accessibility tree misses. +8. **Responsive is design, not just "not broken."** A stacked desktop layout on mobile is not responsive design — it's lazy. Evaluate whether the mobile layout makes *design* sense. +9. **Document incrementally.** Write each finding to the report as you find it. Don't batch. +10. **Depth over breadth.** 5-10 well-documented findings with screenshots and specific suggestions > 20 vague observations. +11. **Show screenshots to the user.** After every \`$B screenshot\`, \`$B snapshot -a -o\`, or \`$B responsive\` command, use the Read tool on the output file(s) so the user can see them inline. For \`responsive\` (3 files), Read all three. This is critical — without it, screenshots are invisible to the user.`; +} + +function generateReviewDashboard(_ctx: TemplateContext): string { + return `## Review Readiness Dashboard + +After completing the review, read the review log and config to display the dashboard. + +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-read +\`\`\` + +Parse the output. Find the most recent entry for each skill (plan-ceo-review, plan-eng-review, review, plan-design-review, design-review-lite, adversarial-review, codex-review, codex-plan-review). Ignore entries with timestamps older than 7 days. For the Eng Review row, show whichever is more recent between \`review\` (diff-scoped pre-landing review) and \`plan-eng-review\` (plan-stage architecture review). Append "(DIFF)" or "(PLAN)" to the status to distinguish. For the Adversarial row, show whichever is more recent between \`adversarial-review\` (new auto-scaled) and \`codex-review\` (legacy). For Design Review, show whichever is more recent between \`plan-design-review\` (full visual audit) and \`design-review-lite\` (code-level check). Append "(FULL)" or "(LITE)" to the status to distinguish. Display: + +\`\`\` ++====================================================================+ +| REVIEW READINESS DASHBOARD | ++====================================================================+ +| Review | Runs | Last Run | Status | Required | +|-----------------|------|---------------------|-----------|----------| +| Eng Review | 1 | 2026-03-16 15:00 | CLEAR | YES | +| CEO Review | 0 | — | — | no | +| Design Review | 0 | — | — | no | +| Adversarial | 0 | — | — | no | +| Outside Voice | 0 | — | — | no | ++--------------------------------------------------------------------+ +| VERDICT: CLEARED — Eng Review passed | ++====================================================================+ +\`\`\` + +**Review tiers:** +- **Eng Review (required by default):** The only review that gates shipping. Covers architecture, code quality, tests, performance. Can be disabled globally with \\\`gstack-config set skip_eng_review true\\\` (the "don't bother me" setting). +- **CEO Review (optional):** Use your judgment. Recommend it for big product/business changes, new user-facing features, or scope decisions. Skip for bug fixes, refactors, infra, and cleanup. +- **Design Review (optional):** Use your judgment. Recommend it for UI/UX changes. Skip for backend-only, infra, or prompt-only changes. +- **Adversarial Review (automatic):** Auto-scales by diff size. Small diffs (<50 lines) skip adversarial. Medium diffs (50–199) get cross-model adversarial. Large diffs (200+) get all 4 passes: Claude structured, Codex structured, Claude adversarial subagent, Codex adversarial. No configuration needed. +- **Outside Voice (optional):** Independent plan review from a different AI model. Offered after all review sections complete in /plan-ceo-review and /plan-eng-review. Falls back to Claude subagent if Codex is unavailable. Never gates shipping. + +**Verdict logic:** +- **CLEARED**: Eng Review has >= 1 entry within 7 days from either \\\`review\\\` or \\\`plan-eng-review\\\` with status "clean" (or \\\`skip_eng_review\\\` is \\\`true\\\`) +- **NOT CLEARED**: Eng Review missing, stale (>7 days), or has open issues +- CEO, Design, and Codex reviews are shown for context but never block shipping +- If \\\`skip_eng_review\\\` config is \\\`true\\\`, Eng Review shows "SKIPPED (global)" and verdict is CLEARED + +**Staleness detection:** After displaying the dashboard, check if any existing reviews may be stale: +- Parse the \\\`---HEAD---\\\` section from the bash output to get the current HEAD commit hash +- For each review entry that has a \\\`commit\\\` field: compare it against the current HEAD. If different, count elapsed commits: \\\`git rev-list --count STORED_COMMIT..HEAD\\\`. Display: "Note: {skill} review from {date} may be stale — {N} commits since review" +- For entries without a \\\`commit\\\` field (legacy entries): display "Note: {skill} review from {date} has no commit tracking — consider re-running for accurate staleness detection" +- If all reviews match the current HEAD, do not display any staleness notes`; +} + +function generatePlanFileReviewReport(_ctx: TemplateContext): string { + return `## Plan File Review Report + +After displaying the Review Readiness Dashboard in conversation output, also update the +**plan file** itself so review status is visible to anyone reading the plan. + +### Detect the plan file + +1. Check if there is an active plan file in this conversation (the host provides plan file + paths in system messages — look for plan file references in the conversation context). +2. If not found, skip this section silently — not every review runs in plan mode. + +### Generate the report + +Read the review log output you already have from the Review Readiness Dashboard step above. +Parse each JSONL entry. Each skill logs different fields: + +- **plan-ceo-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`mode\\\`, \\\`scope_proposed\\\`, \\\`scope_accepted\\\`, \\\`scope_deferred\\\`, \\\`commit\\\` + → Findings: "{scope_proposed} proposals, {scope_accepted} accepted, {scope_deferred} deferred" + → If scope fields are 0 or missing (HOLD/REDUCTION mode): "mode: {mode}, {critical_gaps} critical gaps" +- **plan-eng-review**: \\\`status\\\`, \\\`unresolved\\\`, \\\`critical_gaps\\\`, \\\`issues_found\\\`, \\\`mode\\\`, \\\`commit\\\` + → Findings: "{issues_found} issues, {critical_gaps} critical gaps" +- **plan-design-review**: \\\`status\\\`, \\\`initial_score\\\`, \\\`overall_score\\\`, \\\`unresolved\\\`, \\\`decisions_made\\\`, \\\`commit\\\` + → Findings: "score: {initial_score}/10 → {overall_score}/10, {decisions_made} decisions" +- **codex-review**: \\\`status\\\`, \\\`gate\\\`, \\\`findings\\\`, \\\`findings_fixed\\\` + → Findings: "{findings} findings, {findings_fixed}/{findings} fixed" + +All fields needed for the Findings column are now present in the JSONL entries. +For the review you just completed, you may use richer details from your own Completion +Summary. For prior reviews, use the JSONL fields directly — they contain all required data. + +Produce this markdown table: + +\\\`\\\`\\\`markdown +## GSTACK REVIEW REPORT + +| Review | Trigger | Why | Runs | Status | Findings | +|--------|---------|-----|------|--------|----------| +| CEO Review | \\\`/plan-ceo-review\\\` | Scope & strategy | {runs} | {status} | {findings} | +| Codex Review | \\\`/codex review\\\` | Independent 2nd opinion | {runs} | {status} | {findings} | +| Eng Review | \\\`/plan-eng-review\\\` | Architecture & tests (required) | {runs} | {status} | {findings} | +| Design Review | \\\`/plan-design-review\\\` | UI/UX gaps | {runs} | {status} | {findings} | +\\\`\\\`\\\` + +Below the table, add these lines (omit any that are empty/not applicable): + +- **CODEX:** (only if codex-review ran) — one-line summary of codex fixes +- **CROSS-MODEL:** (only if both Claude and Codex reviews exist) — overlap analysis +- **UNRESOLVED:** total unresolved decisions across all reviews +- **VERDICT:** list reviews that are CLEAR (e.g., "CEO + ENG CLEARED — ready to implement"). + If Eng Review is not CLEAR and not skipped globally, append "eng review required". + +### Write to the plan file + +**PLAN MODE EXCEPTION — ALWAYS RUN:** This writes to the plan file, which is the one +file you are allowed to edit in plan mode. The plan file review report is part of the +plan's living status. + +- Search the plan file for a \\\`## GSTACK REVIEW REPORT\\\` section **anywhere** in the file + (not just at the end — content may have been added after it). +- If found, **replace it** entirely using the Edit tool. Match from \\\`## GSTACK REVIEW REPORT\\\` + through either the next \\\`## \\\` heading or end of file, whichever comes first. This ensures + content added after the report section is preserved, not eaten. If the Edit fails + (e.g., concurrent edit changed the content), re-read the plan file and retry once. +- If no such section exists, **append it** to the end of the plan file. +- Always place it as the very last section in the plan file. If it was found mid-file, + move it: delete the old location and append at the end.`; +} + +function generateTestBootstrap(_ctx: TemplateContext): string { + return `## Test Framework Bootstrap + +**Detect existing test framework and project runtime:** + +\`\`\`bash +# Detect project runtime +[ -f Gemfile ] && echo "RUNTIME:ruby" +[ -f package.json ] && echo "RUNTIME:node" +[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python" +[ -f go.mod ] && echo "RUNTIME:go" +[ -f Cargo.toml ] && echo "RUNTIME:rust" +[ -f composer.json ] && echo "RUNTIME:php" +[ -f mix.exs ] && echo "RUNTIME:elixir" +# Detect sub-frameworks +[ -f Gemfile ] && grep -q "rails" Gemfile 2>/dev/null && echo "FRAMEWORK:rails" +[ -f package.json ] && grep -q '"next"' package.json 2>/dev/null && echo "FRAMEWORK:nextjs" +# Check for existing test infrastructure +ls jest.config.* vitest.config.* playwright.config.* .rspec pytest.ini pyproject.toml phpunit.xml 2>/dev/null +ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null +# Check opt-out marker +[ -f .gstack/no-test-bootstrap ] && echo "BOOTSTRAP_DECLINED" +\`\`\` + +**If test framework detected** (config files or test directories found): +Print "Test framework detected: {name} ({N} existing tests). Skipping bootstrap." +Read 2-3 existing test files to learn conventions (naming, imports, assertion style, setup patterns). +Store conventions as prose context for use in Phase 8e.5 or Step 3.4. **Skip the rest of bootstrap.** + +**If BOOTSTRAP_DECLINED** appears: Print "Test bootstrap previously declined — skipping." **Skip the rest of bootstrap.** + +**If NO runtime detected** (no config files found): Use AskUserQuestion: +"I couldn't detect your project's language. What runtime are you using?" +Options: A) Node.js/TypeScript B) Ruby/Rails C) Python D) Go E) Rust F) PHP G) Elixir H) This project doesn't need tests. +If user picks H → write \`.gstack/no-test-bootstrap\` and continue without tests. + +**If runtime detected but no test framework — bootstrap:** + +### B2. Research best practices + +Use WebSearch to find current best practices for the detected runtime: +- \`"[runtime] best test framework 2025 2026"\` +- \`"[framework A] vs [framework B] comparison"\` + +If WebSearch is unavailable, use this built-in knowledge table: + +| Runtime | Primary recommendation | Alternative | +|---------|----------------------|-------------| +| Ruby/Rails | minitest + fixtures + capybara | rspec + factory_bot + shoulda-matchers | +| Node.js | vitest + @testing-library | jest + @testing-library | +| Next.js | vitest + @testing-library/react + playwright | jest + cypress | +| Python | pytest + pytest-cov | unittest | +| Go | stdlib testing + testify | stdlib only | +| Rust | cargo test (built-in) + mockall | — | +| PHP | phpunit + mockery | pest | +| Elixir | ExUnit (built-in) + ex_machina | — | + +### B3. Framework selection + +Use AskUserQuestion: +"I detected this is a [Runtime/Framework] project with no test framework. I researched current best practices. Here are the options: +A) [Primary] — [rationale]. Includes: [packages]. Supports: unit, integration, smoke, e2e +B) [Alternative] — [rationale]. Includes: [packages] +C) Skip — don't set up testing right now +RECOMMENDATION: Choose A because [reason based on project context]" + +If user picks C → write \`.gstack/no-test-bootstrap\`. Tell user: "If you change your mind later, delete \`.gstack/no-test-bootstrap\` and re-run." Continue without tests. + +If multiple runtimes detected (monorepo) → ask which runtime to set up first, with option to do both sequentially. + +### B4. Install and configure + +1. Install the chosen packages (npm/bun/gem/pip/etc.) +2. Create minimal config file +3. Create directory structure (test/, spec/, etc.) +4. Create one example test matching the project's code to verify setup works + +If package installation fails → debug once. If still failing → revert with \`git checkout -- package.json package-lock.json\` (or equivalent for the runtime). Warn user and continue without tests. + +### B4.5. First real tests + +Generate 3-5 real tests for existing code: + +1. **Find recently changed files:** \`git log --since=30.days --name-only --format="" | sort | uniq -c | sort -rn | head -10\` +2. **Prioritize by risk:** Error handlers > business logic with conditionals > API endpoints > pure functions +3. **For each file:** Write one test that tests real behavior with meaningful assertions. Never \`expect(x).toBeDefined()\` — test what the code DOES. +4. Run each test. Passes → keep. Fails → fix once. Still fails → delete silently. +5. Generate at least 1 test, cap at 5. + +Never import secrets, API keys, or credentials in test files. Use environment variables or test fixtures. + +### B5. Verify + +\`\`\`bash +# Run the full test suite to confirm everything works +{detected test command} +\`\`\` + +If tests fail → debug once. If still failing → revert all bootstrap changes and warn user. + +### B5.5. CI/CD pipeline + +\`\`\`bash +# Check CI provider +ls -d .github/ 2>/dev/null && echo "CI:github" +ls .gitlab-ci.yml .circleci/ bitrise.yml 2>/dev/null +\`\`\` + +If \`.github/\` exists (or no CI detected — default to GitHub Actions): +Create \`.github/workflows/test.yml\` with: +- \`runs-on: ubuntu-latest\` +- Appropriate setup action for the runtime (setup-node, setup-ruby, setup-python, etc.) +- The same test command verified in B5 +- Trigger: push + pull_request + +If non-GitHub CI detected → skip CI generation with note: "Detected {provider} — CI pipeline generation supports GitHub Actions only. Add test step to your existing pipeline manually." + +### B6. Create TESTING.md + +First check: If TESTING.md already exists → read it and update/append rather than overwriting. Never destroy existing content. + +Write TESTING.md with: +- Philosophy: "100% test coverage is the key to great vibe coding. Tests let you move fast, trust your instincts, and ship with confidence — without them, vibe coding is just yolo coding. With tests, it's a superpower." +- Framework name and version +- How to run tests (the verified command from B5) +- Test layers: Unit tests (what, where, when), Integration tests, Smoke tests, E2E tests +- Conventions: file naming, assertion style, setup/teardown patterns + +### B7. Update CLAUDE.md + +First check: If CLAUDE.md already has a \`## Testing\` section → skip. Don't duplicate. + +Append a \`## Testing\` section: +- Run command and test directory +- Reference to TESTING.md +- Test expectations: + - 100% test coverage is the goal — tests make vibe coding safe + - When writing new functions, write a corresponding test + - When fixing a bug, write a regression test + - When adding error handling, write a test that triggers the error + - When adding a conditional (if/else, switch), write tests for BOTH paths + - Never commit code that makes existing tests fail + +### B8. Commit + +\`\`\`bash +git status --porcelain +\`\`\` + +Only commit if there are changes. Stage all bootstrap files (config, test directory, TESTING.md, CLAUDE.md, .github/workflows/test.yml if created): +\`git commit -m "chore: bootstrap test framework ({framework name})"\` + +---`; +} + +// ─── Test Coverage Audit ──────────────────────────────────── +// +// Shared methodology for codepath tracing, ASCII diagrams, and test gap analysis. +// Three modes, three placeholders, one inner function: +// +// {{TEST_COVERAGE_AUDIT_PLAN}} → plan-eng-review: adds missing tests to the plan +// {{TEST_COVERAGE_AUDIT_SHIP}} → ship: auto-generates tests, coverage summary +// {{TEST_COVERAGE_AUDIT_REVIEW}} → review: generates tests via Fix-First (ASK) +// +// ┌────────────────────────────────────────────────┐ +// │ generateTestCoverageAuditInner(mode) │ +// │ │ +// │ SHARED: framework detect, codepath trace, │ +// │ ASCII diagram, quality rubric, E2E matrix, │ +// │ regression rule │ +// │ │ +// │ plan: edit plan file, write artifact │ +// │ ship: auto-generate tests, write artifact │ +// │ review: Fix-First ASK, INFORMATIONAL gaps │ +// └────────────────────────────────────────────────┘ + +type CoverageAuditMode = 'plan' | 'ship' | 'review'; + +function generateTestCoverageAuditInner(mode: CoverageAuditMode): string { + const sections: string[] = []; + + // ── Intro (mode-specific) ── + if (mode === 'ship') { + sections.push(`100% coverage is the goal — every untested path is a path where bugs hide and vibe coding becomes yolo coding. Evaluate what was ACTUALLY coded (from the diff), not what was planned.`); + } else if (mode === 'plan') { + sections.push(`100% coverage is the goal. Evaluate every codepath in the plan and ensure the plan includes tests for each one. If the plan is missing tests, add them — the plan should be complete enough that implementation includes full test coverage from the start.`); + } else { + sections.push(`100% coverage is the goal. Evaluate every codepath changed in the diff and identify test gaps. Gaps become INFORMATIONAL findings that follow the Fix-First flow.`); + } + + // ── Test framework detection (shared) ── + sections.push(` +### Test Framework Detection + +Before analyzing coverage, detect the project's test framework: + +1. **Read CLAUDE.md** — look for a \`## Testing\` section with test command and framework name. If found, use that as the authoritative source. +2. **If CLAUDE.md has no testing section, auto-detect:** + +\`\`\`bash +# Detect project runtime +[ -f Gemfile ] && echo "RUNTIME:ruby" +[ -f package.json ] && echo "RUNTIME:node" +[ -f requirements.txt ] || [ -f pyproject.toml ] && echo "RUNTIME:python" +[ -f go.mod ] && echo "RUNTIME:go" +[ -f Cargo.toml ] && echo "RUNTIME:rust" +# Check for existing test infrastructure +ls jest.config.* vitest.config.* playwright.config.* cypress.config.* .rspec pytest.ini phpunit.xml 2>/dev/null +ls -d test/ tests/ spec/ __tests__/ cypress/ e2e/ 2>/dev/null +\`\`\` + +3. **If no framework detected:**${mode === 'ship' ? ' falls through to the Test Framework Bootstrap step (Step 2.5) which handles full setup.' : ' still produce the coverage diagram, but skip test generation.'}`); + + // ── Before/after count (ship only) ── + if (mode === 'ship') { + sections.push(` +**0. Before/after test count:** + +\`\`\`bash +# Count test files before any generation +find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l +\`\`\` + +Store this number for the PR body.`); + } + + // ── Codepath tracing methodology (shared, with mode-specific source) ── + const traceSource = mode === 'plan' + ? `**Step 1. Trace every codepath in the plan:** + +Read the plan document. For each new feature, service, endpoint, or component described, trace how data will flow through the code — don't just list planned functions, actually follow the planned execution:` + : `**${mode === 'ship' ? '1' : 'Step 1'}. Trace every codepath changed** using \`git diff origin/...HEAD\`: + +Read every changed file. For each one, trace how data flows through the code — don't just list functions, actually follow the execution:`; + + const traceStep1 = mode === 'plan' + ? `1. **Read the plan.** For each planned component, understand what it does and how it connects to existing code.` + : `1. **Read the diff.** For each changed file, read the full file (not just the diff hunk) to understand context.`; + + sections.push(` +${traceSource} + +${traceStep1} +2. **Trace data flow.** Starting from each entry point (route handler, exported function, event listener, component render), follow the data through every branch: + - Where does input come from? (request params, props, database, API call) + - What transforms it? (validation, mapping, computation) + - Where does it go? (database write, API response, rendered output, side effect) + - What can go wrong at each step? (null/undefined, invalid input, network failure, empty collection) +3. **Diagram the execution.** For each changed file, draw an ASCII diagram showing: + - Every function/method that was added or modified + - Every conditional branch (if/else, switch, ternary, guard clause, early return) + - Every error path (try/catch, rescue, error boundary, fallback) + - Every call to another function (trace into it — does IT have untested branches?) + - Every edge: what happens with null input? Empty array? Invalid type? + +This is the critical step — you're building a map of every line of code that can execute differently based on input. Every branch in this diagram needs a test.`); + + // ── User flow coverage (shared) ── + sections.push(` +**${mode === 'ship' ? '2' : 'Step 2'}. Map user flows, interactions, and error states:** + +Code coverage isn't enough — you need to cover how real users interact with the changed code. For each changed feature, think through: + +- **User flows:** What sequence of actions does a user take that touches this code? Map the full journey (e.g., "user clicks 'Pay' → form validates → API call → success/failure screen"). Each step in the journey needs a test. +- **Interaction edge cases:** What happens when the user does something unexpected? + - Double-click/rapid resubmit + - Navigate away mid-operation (back button, close tab, click another link) + - Submit with stale data (page sat open for 30 minutes, session expired) + - Slow connection (API takes 10 seconds — what does the user see?) + - Concurrent actions (two tabs, same form) +- **Error states the user can see:** For every error the code handles, what does the user actually experience? + - Is there a clear error message or a silent failure? + - Can the user recover (retry, go back, fix input) or are they stuck? + - What happens with no network? With a 500 from the API? With invalid data from the server? +- **Empty/zero/boundary states:** What does the UI show with zero results? With 10,000 results? With a single character input? With maximum-length input? + +Add these to your diagram alongside the code branches. A user flow with no test is just as much a gap as an untested if/else.`); + + // ── Check branches against tests + quality rubric (shared) ── + sections.push(` +**${mode === 'ship' ? '3' : 'Step 3'}. Check each branch against existing tests:** + +Go through your diagram branch by branch — both code paths AND user flows. For each one, search for a test that exercises it: +- Function \`processPayment()\` → look for \`billing.test.ts\`, \`billing.spec.ts\`, \`test/billing_test.rb\` +- An if/else → look for tests covering BOTH the true AND false path +- An error handler → look for a test that triggers that specific error condition +- A call to \`helperFn()\` that has its own branches → those branches need tests too +- A user flow → look for an integration or E2E test that walks through the journey +- An interaction edge case → look for a test that simulates the unexpected action + +Quality scoring rubric: +- ★★★ Tests behavior with edge cases AND error paths +- ★★ Tests correct behavior, happy path only +- ★ Smoke test / existence check / trivial assertion (e.g., "it renders", "it doesn't throw")`); + + // ── E2E test decision matrix (shared) ── + sections.push(` +### E2E Test Decision Matrix + +When checking each branch, also determine whether a unit test or E2E/integration test is the right tool: + +**RECOMMEND E2E (mark as [→E2E] in the diagram):** +- Common user flow spanning 3+ components/services (e.g., signup → verify email → first login) +- Integration point where mocking hides real failures (e.g., API → queue → worker → DB) +- Auth/payment/data-destruction flows — too important to trust unit tests alone + +**RECOMMEND EVAL (mark as [→EVAL] in the diagram):** +- Critical LLM call that needs a quality eval (e.g., prompt change → test output still meets quality bar) +- Changes to prompt templates, system instructions, or tool definitions + +**STICK WITH UNIT TESTS:** +- Pure function with clear inputs/outputs +- Internal helper with no side effects +- Edge case of a single function (null input, empty array) +- Obscure/rare flow that isn't customer-facing`); + + // ── Regression rule (shared) ── + sections.push(` +### REGRESSION RULE (mandatory) + +**IRON RULE:** When the coverage audit identifies a REGRESSION — code that previously worked but the diff broke — a regression test is ${mode === 'plan' ? 'added to the plan as a critical requirement' : 'written immediately'}. No AskUserQuestion. No skipping. Regressions are the highest-priority test because they prove something broke. + +A regression is when: +- The diff modifies existing behavior (not new code) +- The existing test suite (if any) doesn't cover the changed path +- The change introduces a new failure mode for existing callers + +When uncertain whether a change is a regression, err on the side of writing the test.${mode !== 'plan' ? '\n\nFormat: commit as `test: regression test for {what broke}`' : ''}`); + + // ── ASCII coverage diagram (shared) ── + sections.push(` +**${mode === 'ship' ? '4' : 'Step 4'}. Output ASCII coverage diagram:** + +Include BOTH code paths and user flows in the same diagram. Mark E2E-worthy and eval-worthy paths: + +\`\`\` +CODE PATH COVERAGE +=========================== +[+] src/services/billing.ts + │ + ├── processPayment() + │ ├── [★★★ TESTED] Happy path + card declined + timeout — billing.test.ts:42 + │ ├── [GAP] Network timeout — NO TEST + │ └── [GAP] Invalid currency — NO TEST + │ + └── refundPayment() + ├── [★★ TESTED] Full refund — billing.test.ts:89 + └── [★ TESTED] Partial refund (checks non-throw only) — billing.test.ts:101 + +USER FLOW COVERAGE +=========================== +[+] Payment checkout flow + │ + ├── [★★★ TESTED] Complete purchase — checkout.e2e.ts:15 + ├── [GAP] [→E2E] Double-click submit — needs E2E, not just unit + ├── [GAP] Navigate away during payment — unit test sufficient + └── [★ TESTED] Form validation errors (checks render only) — checkout.test.ts:40 + +[+] Error states + │ + ├── [★★ TESTED] Card declined message — billing.test.ts:58 + ├── [GAP] Network timeout UX (what does user see?) — NO TEST + └── [GAP] Empty cart submission — NO TEST + +[+] LLM integration + │ + └── [GAP] [→EVAL] Prompt template change — needs eval test + +───────────────────────────────── +COVERAGE: 5/13 paths tested (38%) + Code paths: 3/5 (60%) + User flows: 2/8 (25%) +QUALITY: ★★★: 2 ★★: 2 ★: 1 +GAPS: 8 paths need tests (2 need E2E, 1 needs eval) +───────────────────────────────── +\`\`\` + +**Fast path:** All paths covered → "${mode === 'ship' ? 'Step 3.4' : mode === 'review' ? 'Step 4.75' : 'Test review'}: All new code paths have test coverage ✓" Continue.`); + + // ── Mode-specific action section ── + if (mode === 'plan') { + sections.push(` +**Step 5. Add missing tests to the plan:** + +For each GAP identified in the diagram, add a test requirement to the plan. Be specific: +- What test file to create (match existing naming conventions) +- What the test should assert (specific inputs → expected outputs/behavior) +- Whether it's a unit test, E2E test, or eval (use the decision matrix) +- For regressions: flag as **CRITICAL** and explain what broke + +The plan should be complete enough that when implementation begins, every test is written alongside the feature code — not deferred to a follow-up.`); + + // ── Test plan artifact (plan + ship) ── + sections.push(` +### Test Plan Artifact + +After producing the coverage diagram, write a test plan artifact to the project directory so \`/qa\` and \`/qa-only\` can consume it as primary test input: + +\`\`\`bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +\`\`\` + +Write to \`~/.gstack/projects/{slug}/{user}-{branch}-eng-review-test-plan-{datetime}.md\`: + +\`\`\`markdown +# Test Plan +Generated by /plan-eng-review on {date} +Branch: {branch} +Repo: {owner/repo} + +## Affected Pages/Routes +- {URL path} — {what to test and why} + +## Key Interactions to Verify +- {interaction description} on {page} + +## Edge Cases +- {edge case} on {page} + +## Critical Paths +- {end-to-end flow that must work} +\`\`\` + +This file is consumed by \`/qa\` and \`/qa-only\` as primary test input. Include only the information that helps a QA tester know **what to test and where** — not implementation details.`); + } else if (mode === 'ship') { + sections.push(` +**5. Generate tests for uncovered paths:** + +If test framework detected (or bootstrapped in Step 2.5): +- Prioritize error handlers and edge cases first (happy paths are more likely already tested) +- Read 2-3 existing test files to match conventions exactly +- Generate unit tests. Mock all external dependencies (DB, API, Redis). +- For paths marked [→E2E]: generate integration/E2E tests using the project's E2E framework (Playwright, Cypress, Capybara, etc.) +- For paths marked [→EVAL]: generate eval tests using the project's eval framework, or flag for manual eval if none exists +- Write tests that exercise the specific uncovered path with real assertions +- Run each test. Passes → commit as \`test: coverage for {feature}\` +- Fails → fix once. Still fails → revert, note gap in diagram. + +Caps: 30 code paths max, 20 tests generated max (code + user flow combined), 2-min per-test exploration cap. + +If no test framework AND user declined bootstrap → diagram only, no generation. Note: "Test generation skipped — no test framework configured." + +**Diff is test-only changes:** Skip Step 3.4 entirely: "No new application code paths to audit." + +**6. After-count and coverage summary:** + +\`\`\`bash +# Count test files after generation +find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec.*' | grep -v node_modules | wc -l +\`\`\` + +For PR body: \`Tests: {before} → {after} (+{delta} new)\` +Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\``); + + // ── Test plan artifact (ship mode) ── + sections.push(` +### Test Plan Artifact + +After producing the coverage diagram, write a test plan artifact so \`/qa\` and \`/qa-only\` can consume it: + +\`\`\`bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +USER=$(whoami) +DATETIME=$(date +%Y%m%d-%H%M%S) +\`\`\` + +Write to \`~/.gstack/projects/{slug}/{user}-{branch}-ship-test-plan-{datetime}.md\`: + +\`\`\`markdown +# Test Plan +Generated by /ship on {date} +Branch: {branch} +Repo: {owner/repo} + +## Affected Pages/Routes +- {URL path} — {what to test and why} + +## Key Interactions to Verify +- {interaction description} on {page} + +## Edge Cases +- {edge case} on {page} + +## Critical Paths +- {end-to-end flow that must work} +\`\`\``); + } else { + // review mode + sections.push(` +**Step 5. Generate tests for gaps (Fix-First):** + +If test framework is detected and gaps were identified: +- Classify each gap as AUTO-FIX or ASK per the Fix-First Heuristic: + - **AUTO-FIX:** Simple unit tests for pure functions, edge cases of existing tested functions + - **ASK:** E2E tests, tests requiring new test infrastructure, tests for ambiguous behavior +- For AUTO-FIX gaps: generate the test, run it, commit as \`test: coverage for {feature}\` +- For ASK gaps: include in the Fix-First batch question with the other review findings +- For paths marked [→E2E]: always ASK (E2E tests are higher-effort and need user confirmation) +- For paths marked [→EVAL]: always ASK (eval tests need user confirmation on quality criteria) + +If no test framework detected → include gaps as INFORMATIONAL findings only, no generation. + +**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit."`); + } + + return sections.join('\n'); +} + +function generateTestCoverageAuditPlan(_ctx: TemplateContext): string { + return generateTestCoverageAuditInner('plan'); +} + +function generateTestCoverageAuditShip(_ctx: TemplateContext): string { + return generateTestCoverageAuditInner('ship'); +} + +function generateTestCoverageAuditReview(_ctx: TemplateContext): string { + return generateTestCoverageAuditInner('review'); +} + +function generateSpecReviewLoop(_ctx: TemplateContext): string { + return `## Spec Review Loop + +Before presenting the document to the user for approval, run an adversarial review. + +**Step 1: Dispatch reviewer subagent** + +Use the Agent tool to dispatch an independent reviewer. The reviewer has fresh context +and cannot see the brainstorming conversation — only the document. This ensures genuine +adversarial independence. + +Prompt the subagent with: +- The file path of the document just written +- "Read this document and review it on 5 dimensions. For each dimension, note PASS or + list specific issues with suggested fixes. At the end, output a quality score (1-10) + across all dimensions." + +**Dimensions:** +1. **Completeness** — Are all requirements addressed? Missing edge cases? +2. **Consistency** — Do parts of the document agree with each other? Contradictions? +3. **Clarity** — Could an engineer implement this without asking questions? Ambiguous language? +4. **Scope** — Does the document creep beyond the original problem? YAGNI violations? +5. **Feasibility** — Can this actually be built with the stated approach? Hidden complexity? + +The subagent should return: +- A quality score (1-10) +- PASS if no issues, or a numbered list of issues with dimension, description, and fix + +**Step 2: Fix and re-dispatch** + +If the reviewer returns issues: +1. Fix each issue in the document on disk (use Edit tool) +2. Re-dispatch the reviewer subagent with the updated document +3. Maximum 3 iterations total + +**Convergence guard:** If the reviewer returns the same issues on consecutive iterations +(the fix didn't resolve them or the reviewer disagrees with the fix), stop the loop +and persist those issues as "Reviewer Concerns" in the document rather than looping +further. + +If the subagent fails, times out, or is unavailable — skip the review loop entirely. +Tell the user: "Spec review unavailable — presenting unreviewed doc." The document is +already written to disk; the review is a quality bonus, not a gate. + +**Step 3: Report and persist metrics** + +After the loop completes (PASS, max iterations, or convergence guard): + +1. Tell the user the result — summary by default: + "Your doc survived N rounds of adversarial review. M issues caught and fixed. + Quality score: X/10." + If they ask "what did the reviewer find?", show the full reviewer output. + +2. If issues remain after max iterations or convergence, add a "## Reviewer Concerns" + section to the document listing each unresolved issue. Downstream skills will see this. + +3. Append metrics: +\`\`\`bash +mkdir -p ~/.gstack/analytics +echo '{"skill":"${_ctx.skillName}","ts":"'$(date -u +%Y-%m-%dT%H:%M:%SZ)'","iterations":ITERATIONS,"issues_found":FOUND,"issues_fixed":FIXED,"remaining":REMAINING,"quality_score":SCORE}' >> ~/.gstack/analytics/spec-review.jsonl 2>/dev/null || true +\`\`\` +Replace ITERATIONS, FOUND, FIXED, REMAINING, SCORE with actual values from the review.`; +} + +function generateBenefitsFrom(ctx: TemplateContext): string { + if (!ctx.benefitsFrom || ctx.benefitsFrom.length === 0) return ''; + + const skillList = ctx.benefitsFrom.map(s => `\`/${s}\``).join(' or '); + const first = ctx.benefitsFrom[0]; + + return `## Prerequisite Skill Offer + +When the design doc check above prints "No design doc found," offer the prerequisite +skill before proceeding. + +Say to the user via AskUserQuestion: + +> "No design doc found for this branch. ${skillList} produces a structured problem +> statement, premise challenge, and explored alternatives — it gives this review much +> sharper input to work with. Takes about 10 minutes. The design doc is per-feature, +> not per-product — it captures the thinking behind this specific change." + +Options: +- A) Run /${first} now (we'll pick up the review right after) +- B) Skip — proceed with standard review + +If they skip: "No worries — standard review. If you ever want sharper input, try +/${first} first next time." Then proceed normally. Do not re-offer later in the session. + +If they choose A: + +Say: "Running /${first} inline. Once the design doc is ready, I'll pick up +the review right where we left off." + +Read the ${first} skill file from disk using the Read tool: +\`~/.claude/skills/gstack/${first}/SKILL.md\` + +Follow it inline, **skipping these sections** (already handled by the parent skill): +- Preamble (run first) +- AskUserQuestion Format +- Completeness Principle — Boil the Lake +- Search Before Building +- Contributor Mode +- Completion Status Protocol +- Telemetry (run last) + +If the Read fails (file not found), say: +"Could not load /${first} — proceeding with standard review." + +After /${first} completes, re-run the design doc check: +\`\`\`bash +SLUG=$(~/.claude/skills/gstack/browse/bin/remote-slug 2>/dev/null || basename "$(git rev-parse --show-toplevel 2>/dev/null || pwd)") +BRANCH=$(git rev-parse --abbrev-ref HEAD 2>/dev/null | tr '/' '-' || echo 'no-branch') +DESIGN=$(ls -t ~/.gstack/projects/$SLUG/*-$BRANCH-design-*.md 2>/dev/null | head -1) +[ -z "$DESIGN" ] && DESIGN=$(ls -t ~/.gstack/projects/$SLUG/*-design-*.md 2>/dev/null | head -1) +[ -n "$DESIGN" ] && echo "Design doc found: $DESIGN" || echo "No design doc found" +\`\`\` + +If a design doc is now found, read it and continue the review. +If none was produced (user may have cancelled), proceed with standard review.`; +} + +function generateDesignSketch(_ctx: TemplateContext): string { + return `## Visual Sketch (UI ideas only) + +If the chosen approach involves user-facing UI (screens, pages, forms, dashboards, +or interactive elements), generate a rough wireframe to help the user visualize it. +If the idea is backend-only, infrastructure, or has no UI component — skip this +section silently. + +**Step 1: Gather design context** + +1. Check if \`DESIGN.md\` exists in the repo root. If it does, read it for design + system constraints (colors, typography, spacing, component patterns). Use these + constraints in the wireframe. +2. Apply core design principles: + - **Information hierarchy** — what does the user see first, second, third? + - **Interaction states** — loading, empty, error, success, partial + - **Edge case paranoia** — what if the name is 47 chars? Zero results? Network fails? + - **Subtraction default** — "as little design as possible" (Rams). Every element earns its pixels. + - **Design for trust** — every interface element builds or erodes user trust. + +**Step 2: Generate wireframe HTML** + +Generate a single-page HTML file with these constraints: +- **Intentionally rough aesthetic** — use system fonts, thin gray borders, no color, + hand-drawn-style elements. This is a sketch, not a polished mockup. +- Self-contained — no external dependencies, no CDN links, inline CSS only +- Show the core interaction flow (1-3 screens/states max) +- Include realistic placeholder content (not "Lorem ipsum" — use content that + matches the actual use case) +- Add HTML comments explaining design decisions + +Write to a temp file: +\`\`\`bash +SKETCH_FILE="/tmp/gstack-sketch-$(date +%s).html" +\`\`\` + +**Step 3: Render and capture** + +\`\`\`bash +$B goto "file://$SKETCH_FILE" +$B screenshot /tmp/gstack-sketch.png +\`\`\` + +If \`$B\` is not available (browse binary not set up), skip the render step. Tell the +user: "Visual sketch requires the browse binary. Run the setup script to enable it." + +**Step 4: Present and iterate** + +Show the screenshot to the user. Ask: "Does this feel right? Want to iterate on the layout?" + +If they want changes, regenerate the HTML with their feedback and re-render. +If they approve or say "good enough," proceed. + +**Step 5: Include in design doc** + +Reference the wireframe screenshot in the design doc's "Recommended Approach" section. +The screenshot file at \`/tmp/gstack-sketch.png\` can be referenced by downstream skills +(\`/plan-design-review\`, \`/design-review\`) to see what was originally envisioned. + +**Step 6: Outside design voices** (optional) + +After the wireframe is approved, offer outside design perspectives: + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If Codex is available, use AskUserQuestion: +> "Want outside design perspectives on the chosen approach? Codex proposes a visual thesis, content plan, and interaction ideas. A Claude subagent proposes an alternative aesthetic direction." +> +> A) Yes — get outside design voices +> B) No — proceed without + +If user chooses A, launch both voices simultaneously: + +1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): +\`\`\`bash +TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +\`\`\` +Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` + +2. **Claude subagent** (via Agent tool): +"For this product approach, what design direction would you recommend? What aesthetic, typography, and interaction patterns fit? What would make this approach feel inevitable to the user? Be specific — font names, hex colors, spacing values." + +Present Codex output under \`CODEX SAYS (design sketch):\` and subagent output under \`CLAUDE SUBAGENT (design direction):\`. +Error handling: all non-blocking. On failure, skip and continue.`; +} + +function generateCodexSecondOpinion(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + return `## Phase 3.5: Cross-Model Second Opinion (optional) + +**Binary check first — no question if unavailable:** + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +If \`CODEX_NOT_AVAILABLE\`: skip Phase 3.5 entirely — no message, no AskUserQuestion. Proceed directly to Phase 4. + +If \`CODEX_AVAILABLE\`: use AskUserQuestion: + +> Want a second opinion from a different AI model? Codex will independently review your problem statement, key answers, premises, and any landscape findings from this session. It hasn't seen this conversation — it gets a structured summary. Usually takes 2-5 minutes. +> A) Yes, get a second opinion +> B) No, proceed to alternatives + +If B: skip Phase 3.5 entirely. Remember that Codex did NOT run (affects design doc, founder signals, and Phase 4 below). + +**If A: Run the Codex cold read.** + +1. Assemble a structured context block from Phases 1-3: + - Mode (Startup or Builder) + - Problem statement (from Phase 1) + - Key answers from Phase 2A/2B (summarize each Q&A in 1-2 sentences, include verbatim user quotes) + - Landscape findings (from Phase 2.75, if search was run) + - Agreed premises (from Phase 3) + - Codebase context (project name, languages, recent activity) + +2. **Write the assembled prompt to a temp file** (prevents shell injection from user-derived content): + +\`\`\`bash +CODEX_PROMPT_FILE=$(mktemp /tmp/gstack-codex-oh-XXXXXXXX.txt) +\`\`\` + +Write the full prompt (context block + instructions) to this file. Use the mode-appropriate variant: + +**Startup mode instructions:** "You are an independent technical advisor reading a transcript of a startup brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the STRONGEST version of what this person is trying to build? Steelman it in 2-3 sentences. 2) What is the ONE thing from their answers that reveals the most about what they should actually build? Quote it and explain why. 3) Name ONE agreed premise you think is wrong, and what evidence would prove you right. 4) If you had 48 hours and one engineer to build a prototype, what would you build? Be specific — tech stack, features, what you'd skip. Be direct. Be terse. No preamble." + +**Builder mode instructions:** "You are an independent technical advisor reading a transcript of a builder brainstorming session. [CONTEXT BLOCK HERE]. Your job: 1) What is the COOLEST version of this they haven't considered? 2) What's the ONE thing from their answers that reveals what excites them most? Quote it. 3) What existing open source project or tool gets them 50% of the way there — and what's the 50% they'd need to build? 4) If you had a weekend to build this, what would you build first? Be specific. Be direct. No preamble." + +3. Run Codex: + +\`\`\`bash +TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) +codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +\`\`\` + +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_OH" +rm -f "$TMPERR_OH" "$CODEX_PROMPT_FILE" +\`\`\` + +**Error handling:** All errors are non-blocking — Codex second opinion is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate. Skipping second opinion." +- **Timeout:** "Codex timed out after 5 minutes. Skipping second opinion." +- **Empty response:** "Codex returned no response. Stderr: . Skipping second opinion." + +On any error, proceed to Phase 4 — do NOT fall back to a Claude subagent (this is brainstorming, not adversarial review). + +4. **Presentation:** + +\`\`\` +SECOND OPINION (Codex): +════════════════════════════════════════════════════════════ + +════════════════════════════════════════════════════════════ +\`\`\` + +5. **Cross-model synthesis:** After presenting Codex output, provide 3-5 bullet synthesis: + - Where Claude agrees with Codex + - Where Claude disagrees and why + - Whether Codex's challenged premise changes Claude's recommendation + +6. **Premise revision check:** If Codex challenged an agreed premise, use AskUserQuestion: + +> Codex challenged premise #{N}: "{premise text}". Their argument: "{reasoning}". +> A) Revise this premise based on Codex's input +> B) Keep the original premise — proceed to alternatives + +If A: revise the premise and note the revision. If B: proceed (and note that the user defended this premise with reasoning — this is a founder signal if they articulate WHY they disagree, not just dismiss).`; +} + +function generateAdversarialStep(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + const isShip = ctx.skillName === 'ship'; + const stepNum = isShip ? '3.8' : '5.7'; + + return `## Step ${stepNum}: Adversarial review (auto-scaled) + +Adversarial review thoroughness scales automatically based on diff size. No configuration needed. + +**Detect diff size and tool availability:** + +\`\`\`bash +DIFF_INS=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ insertion' | grep -oE '[0-9]+' || echo "0") +DIFF_DEL=$(git diff origin/ --stat | tail -1 | grep -oE '[0-9]+ deletion' | grep -oE '[0-9]+' || echo "0") +DIFF_TOTAL=$((DIFF_INS + DIFF_DEL)) +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +# Respect old opt-out +OLD_CFG=$(~/.claude/skills/gstack/bin/gstack-config get codex_reviews 2>/dev/null || true) +echo "DIFF_SIZE: $DIFF_TOTAL" +echo "OLD_CFG: \${OLD_CFG:-not_set}" +\`\`\` + +If \`OLD_CFG\` is \`disabled\`: skip this step silently. Continue to the next step. + +**User override:** If the user explicitly requested a specific tier (e.g., "run all passes", "paranoid review", "full adversarial", "do all 4 passes", "thorough review"), honor that request regardless of diff size. Jump to the matching tier section. + +**Auto-select tier based on diff size:** +- **Small (< 50 lines changed):** Skip adversarial review entirely. Print: "Small diff ($DIFF_TOTAL lines) — adversarial review skipped." Continue to the next step. +- **Medium (50–199 lines changed):** Run Codex adversarial challenge (or Claude adversarial subagent if Codex unavailable). Jump to the "Medium tier" section. +- **Large (200+ lines changed):** Run all remaining passes — Codex structured review + Claude adversarial subagent + Codex adversarial. Jump to the "Large tier" section. + +--- + +### Medium tier (50–199 lines) + +Claude's structured review already ran. Now add a **cross-model adversarial challenge**. + +**If Codex is available:** run the Codex adversarial challenge. **If Codex is NOT available:** fall back to the Claude adversarial subagent instead. + +**Codex adversarial:** + +\`\`\`bash +TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +\`\`\` + +Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_ADV" +\`\`\` + +Present the full output verbatim. This is informational — it never blocks shipping. + +**Error handling:** All errors are non-blocking — adversarial review is a quality enhancement, not a prerequisite. +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \\\`codex login\\\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response. Stderr: ." + +On any Codex error, fall back to the Claude adversarial subagent automatically. + +**Claude adversarial subagent** (fallback when Codex unavailable or errored): + +Dispatch via the Agent tool. The subagent has fresh context — no checklist bias from the structured review. This genuine independence catches things the primary reviewer is blind to. + +Subagent prompt: +"Read the diff for this branch with \`git diff origin/\`. Think like an attacker and a chaos engineer. Your job is to find ways this code will fail in production. Look for: edge cases, race conditions, security holes, resource leaks, failure modes, silent data corruption, logic errors that produce wrong results silently, error handling that swallows failures, and trust boundary violations. Be adversarial. Be thorough. No compliments — just the problems. For each finding, classify as FIXABLE (you know how to fix it) or INVESTIGATE (needs human judgment)." + +Present findings under an \`ADVERSARIAL REVIEW (Claude subagent):\` header. **FIXABLE findings** flow into the same Fix-First pipeline as the structured review. **INVESTIGATE findings** are presented as informational. + +If the subagent fails or times out: "Claude adversarial subagent unavailable. Continuing without adversarial review." + +**Persist the review result:** +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"medium","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Substitute STATUS: "clean" if no findings, "issues_found" if findings exist. SOURCE: "codex" if Codex ran, "claude" if subagent ran. If both failed, do NOT persist. + +**Cleanup:** Run \`rm -f "$TMPERR_ADV"\` after processing (if Codex was used). + +--- + +### Large tier (200+ lines) + +Claude's structured review already ran. Now run **all three remaining passes** for maximum coverage: + +**1. Codex structured review (if available):** +\`\`\`bash +TMPERR=$(mktemp /tmp/codex-review-XXXXXXXX) +codex review --base -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR" +\`\`\` + +Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. Present output under \`CODEX SAYS (code review):\` header. +Check for \`[P1]\` markers: found → \`GATE: FAIL\`, not found → \`GATE: PASS\`. + +If GATE is FAIL, use AskUserQuestion: +\`\`\` +Codex found N critical issues in the diff. + +A) Investigate and fix now (recommended) +B) Continue — review will still complete +\`\`\` + +If A: address the findings${isShip ? '. After fixing, re-run tests (Step 3) since code has changed' : ''}. Re-run \`codex review\` to verify. + +Read stderr for errors (same error handling as medium tier). + +After stderr: \`rm -f "$TMPERR"\` + +**2. Claude adversarial subagent:** Dispatch a subagent with the adversarial prompt (same prompt as medium tier). This always runs regardless of Codex availability. + +**3. Codex adversarial challenge (if available):** Run \`codex exec\` with the adversarial prompt (same as medium tier). + +If Codex is not available for steps 1 and 3, note to the user: "Codex CLI not found — large-diff review ran Claude structured + Claude adversarial (2 of 4 passes). Install Codex for full 4-pass coverage: \`npm install -g @openai/codex\`" + +**Persist the review result AFTER all passes complete** (not after each sub-step): +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"adversarial-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","tier":"large","gate":"GATE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Substitute: STATUS = "clean" if no findings across ALL passes, "issues_found" if any pass found issues. SOURCE = "both" if Codex ran, "claude" if only Claude subagent ran. GATE = the Codex structured review gate result ("pass"/"fail"), or "informational" if Codex was unavailable. If all passes failed, do NOT persist. + +--- + +### Cross-model synthesis (medium and large tiers) + +After all passes complete, synthesize findings across all sources: + +\`\`\` +ADVERSARIAL REVIEW SYNTHESIS (auto: TIER, N lines): +════════════════════════════════════════════════════════════ + High confidence (found by multiple sources): [findings agreed on by >1 pass] + Unique to Claude structured review: [from earlier step] + Unique to Claude adversarial: [from subagent, if ran] + Unique to Codex: [from codex adversarial or code review, if ran] + Models used: Claude structured ✓ Claude adversarial ✓/✗ Codex ✓/✗ +════════════════════════════════════════════════════════════ +\`\`\` + +High-confidence findings (agreed on by multiple sources) should be prioritized for fixes. + +---`; +} + +function generateCodexPlanReview(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + return `## Outside Voice — Independent Plan Challenge (optional, recommended) + +After all review sections are complete, offer an independent second opinion from a +different AI system. Two models agreeing on a plan is stronger signal than one model's +thorough review. + +**Check tool availability:** + +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +Use AskUserQuestion: + +> "All review sections are complete. Want an outside voice? A different AI system can +> give a brutally honest, independent challenge of this plan — logical gaps, feasibility +> risks, and blind spots that are hard to catch from inside the review. Takes about 2 +> minutes." +> +> RECOMMENDATION: Choose A — an independent second opinion catches structural blind +> spots. Two different AI models agreeing on a plan is stronger signal than one model's +> thorough review. Completeness: A=9/10, B=7/10. + +Options: +- A) Get the outside voice (recommended) +- B) Skip — proceed to outputs + +**If B:** Print "Skipping outside voice." and continue to the next section. + +**If A:** Construct the plan review prompt. Read the plan file being reviewed (the file +the user pointed this review at, or the branch diff scope). If a CEO plan document +was written in Step 0D-POST, read that too — it contains the scope decisions and vision. + +Construct this prompt (substitute the actual plan content — if plan content exceeds 30KB, +truncate to the first 30KB and note "Plan truncated for size"): + +"You are a brutally honest technical reviewer examining a development plan that has +already been through a multi-section review. Your job is NOT to repeat that review. +Instead, find what it missed. Look for: logical gaps and unstated assumptions that +survived the review scrutiny, overcomplexity (is there a fundamentally simpler +approach the review was too deep in the weeds to see?), feasibility risks the review +took for granted, missing dependencies or sequencing issues, and strategic +miscalibration (is this the right thing to build at all?). Be direct. Be terse. No +compliments. Just the problems. + +THE PLAN: +" + +**If CODEX_AVAILABLE:** + +\`\`\`bash +TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +\`\`\` + +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_PV" +\`\`\` + +Present the full output verbatim: + +\`\`\` +CODEX SAYS (plan review — outside voice): +════════════════════════════════════════════════════════════ + +════════════════════════════════════════════════════════════ +\`\`\` + +**Error handling:** All errors are non-blocking — the outside voice is informational. +- Auth failure (stderr contains "auth", "login", "unauthorized"): "Codex auth failed. Run \\\`codex login\\\` to authenticate." +- Timeout: "Codex timed out after 5 minutes." +- Empty response: "Codex returned no response." + +On any Codex error, fall back to the Claude adversarial subagent. + +**If CODEX_NOT_AVAILABLE (or Codex errored):** + +Dispatch via the Agent tool. The subagent has fresh context — genuine independence. + +Subagent prompt: same plan review prompt as above. + +Present findings under an \`OUTSIDE VOICE (Claude subagent):\` header. + +If the subagent fails or times out: "Outside voice unavailable. Continuing to outputs." + +**Cross-model tension:** + +After presenting the outside voice findings, note any points where the outside voice +disagrees with the review findings from earlier sections. Flag these as: + +\`\`\` +CROSS-MODEL TENSION: + [Topic]: Review said X. Outside voice says Y. [Your assessment of who's right.] +\`\`\` + +For each substantive tension point, auto-propose as a TODO via AskUserQuestion: + +> "Cross-model disagreement on [topic]. The review found [X] but the outside voice +> argues [Y]. Worth investigating further?" + +Options: +- A) Add to TODOS.md +- B) Skip — not substantive + +If no tension points exist, note: "No cross-model tension — both reviewers agree." + +**Persist the result:** +\`\`\`bash +~/.claude/skills/gstack/bin/gstack-review-log '{"skill":"codex-plan-review","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` + +Substitute: STATUS = "clean" if no findings, "issues_found" if findings exist. +SOURCE = "codex" if Codex ran, "claude" if subagent ran. + +**Cleanup:** Run \`rm -f "$TMPERR_PV"\` after processing (if Codex was used). + +---`; +} + +function generateDeployBootstrap(_ctx: TemplateContext): string { + return `\`\`\`bash +# Check for persisted deploy config in CLAUDE.md +DEPLOY_CONFIG=$(grep -A 20 "## Deploy Configuration" CLAUDE.md 2>/dev/null || echo "NO_CONFIG") +echo "$DEPLOY_CONFIG" + +# If config exists, parse it +if [ "$DEPLOY_CONFIG" != "NO_CONFIG" ]; then + PROD_URL=$(echo "$DEPLOY_CONFIG" | grep -i "production.*url" | head -1 | sed 's/.*: *//') + PLATFORM=$(echo "$DEPLOY_CONFIG" | grep -i "platform" | head -1 | sed 's/.*: *//') + echo "PERSISTED_PLATFORM:$PLATFORM" + echo "PERSISTED_URL:$PROD_URL" +fi + +# Auto-detect platform from config files +[ -f fly.toml ] && echo "PLATFORM:fly" +[ -f render.yaml ] && echo "PLATFORM:render" +([ -f vercel.json ] || [ -d .vercel ]) && echo "PLATFORM:vercel" +[ -f netlify.toml ] && echo "PLATFORM:netlify" +[ -f Procfile ] && echo "PLATFORM:heroku" +([ -f railway.json ] || [ -f railway.toml ]) && echo "PLATFORM:railway" + +# Detect deploy workflows +for f in .github/workflows/*.yml .github/workflows/*.yaml; do + [ -f "$f" ] && grep -qiE "deploy|release|production|staging|cd" "$f" 2>/dev/null && echo "DEPLOY_WORKFLOW:$f" +done +\`\`\` + +If \`PERSISTED_PLATFORM\` and \`PERSISTED_URL\` were found in CLAUDE.md, use them directly +and skip manual detection. If no persisted config exists, use the auto-detected platform +to guide deploy verification. If nothing is detected, ask the user via AskUserQuestion +in the decision tree below. + +If you want to persist deploy settings for future runs, suggest the user run \`/setup-deploy\`.`; +} + +// ─── Design Outside Voices (parallel Codex + Claude subagent) ─────── + +function generateDesignOutsideVoices(ctx: TemplateContext): string { + // Codex host: strip entirely — Codex should never invoke itself + if (ctx.host === 'codex') return ''; + + const rejectionList = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const litmusList = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + + // Skill-specific configuration + const isPlanDesignReview = ctx.skillName === 'plan-design-review'; + const isDesignReview = ctx.skillName === 'design-review'; + const isDesignConsultation = ctx.skillName === 'design-consultation'; + + // Determine opt-in behavior and reasoning effort + const isAutomatic = isDesignReview; // design-review runs automatically + const reasoningEffort = isDesignConsultation ? 'medium' : 'high'; // creative vs analytical + + // Build skill-specific Codex prompt + let codexPrompt: string; + let subagentPrompt: string; + + if (isPlanDesignReview) { + codexPrompt = `Read the plan file at [plan-file-path]. Evaluate this plan's UI/UX design against these criteria. + +HARD REJECTION — flag if ANY apply: +${rejectionList} + +LITMUS CHECKS — answer YES or NO for each: +${litmusList} + +HARD RULES — first classify as MARKETING/LANDING PAGE vs APP UI vs HYBRID, then flag violations of the matching rule set: +- MARKETING: First viewport as one composition, brand-first hierarchy, full-bleed hero, 2-3 intentional motions, composition-first layout +- APP UI: Calm surface hierarchy, dense but readable, utility language, minimal chrome +- UNIVERSAL: CSS variables for colors, no default font stacks, one job per section, cards earn existence + +For each finding: what's wrong, what will happen if it ships unresolved, and the specific fix. Be opinionated. No hedging.`; + + subagentPrompt = `Read the plan file at [plan-file-path]. You are an independent senior product designer reviewing this plan. You have NOT seen any prior review. Evaluate: + +1. Information hierarchy: what does the user see first, second, third? Is it right? +2. Missing states: loading, empty, error, success, partial — which are unspecified? +3. User journey: what's the emotional arc? Where does it break? +4. Specificity: does the plan describe SPECIFIC UI ("48px Söhne Bold header, #1a1a1a on white") or generic patterns ("clean modern card-based layout")? +5. What design decisions will haunt the implementer if left ambiguous? + +For each finding: what's wrong, severity (critical/high/medium), and the fix.`; + } else if (isDesignReview) { + codexPrompt = `Review the frontend source code in this repo. Evaluate against these design hard rules: +- Spacing: systematic (design tokens / CSS variables) or magic numbers? +- Typography: expressive purposeful fonts or default stacks? +- Color: CSS variables with defined system, or hardcoded hex scattered? +- Responsive: breakpoints defined? calc(100svh - header) for heroes? Mobile tested? +- A11y: ARIA landmarks, alt text, contrast ratios, 44px touch targets? +- Motion: 2-3 intentional animations, or zero / ornamental only? +- Cards: used only when card IS the interaction? No decorative card grids? + +First classify as MARKETING/LANDING PAGE vs APP UI vs HYBRID, then apply matching rules. + +LITMUS CHECKS — answer YES/NO: +${litmusList} + +HARD REJECTION — flag if ANY apply: +${rejectionList} + +Be specific. Reference file:line for every finding.`; + + subagentPrompt = `Review the frontend source code in this repo. You are an independent senior product designer doing a source-code design audit. Focus on CONSISTENCY PATTERNS across files rather than individual violations: +- Are spacing values systematic across the codebase? +- Is there ONE color system or scattered approaches? +- Do responsive breakpoints follow a consistent set? +- Is the accessibility approach consistent or spotty? + +For each finding: what's wrong, severity (critical/high/medium), and the file:line.`; + } else if (isDesignConsultation) { + codexPrompt = `Given this product context, propose a complete design direction: +- Visual thesis: one sentence describing mood, material, and energy +- Typography: specific font names (not defaults — no Inter/Roboto/Arial/system) + hex colors +- Color system: CSS variables for background, surface, primary text, muted text, accent +- Layout: composition-first, not component-first. First viewport as poster, not document +- Differentiation: 2 deliberate departures from category norms +- Anti-slop: no purple gradients, no 3-column icon grids, no centered everything, no decorative blobs + +Be opinionated. Be specific. Do not hedge. This is YOUR design direction — own it.`; + + subagentPrompt = `Given this product context, propose a design direction that would SURPRISE. What would the cool indie studio do that the enterprise UI team wouldn't? +- Propose an aesthetic direction, typography stack (specific font names), color palette (hex values) +- 2 deliberate departures from category norms +- What emotional reaction should the user have in the first 3 seconds? + +Be bold. Be specific. No hedging.`; + } else { + // Unknown skill — return empty + return ''; + } + + // Build the opt-in section + const optInSection = isAutomatic ? ` +**Automatic:** Outside voices run automatically when Codex is available. No opt-in needed.` : ` +Use AskUserQuestion: +> "Want outside design voices${isPlanDesignReview ? ' before the detailed review' : ''}? Codex evaluates against OpenAI's design hard rules + litmus checks; Claude subagent does an independent ${isDesignConsultation ? 'design direction proposal' : 'completeness review'}." +> +> A) Yes — run outside design voices +> B) No — proceed without + +If user chooses B, skip this step and continue.`; + + // Build the synthesis section + const synthesisSection = isPlanDesignReview ? ` +**Synthesis — Litmus scorecard:** + +\`\`\` +DESIGN OUTSIDE VOICES — LITMUS SCORECARD: +═══════════════════════════════════════════════════════════════ + Check Claude Codex Consensus + ─────────────────────────────────────── ─────── ─────── ───────── + 1. Brand unmistakable in first screen? — — — + 2. One strong visual anchor? — — — + 3. Scannable by headlines only? — — — + 4. Each section has one job? — — — + 5. Cards actually necessary? — — — + 6. Motion improves hierarchy? — — — + 7. Premium without decorative shadows? — — — + ─────────────────────────────────────── ─────── ─────── ───────── + Hard rejections triggered: — — — +═══════════════════════════════════════════════════════════════ +\`\`\` + +Fill in each cell from the Codex and subagent outputs. CONFIRMED = both agree. DISAGREE = models differ. NOT SPEC'D = not enough info to evaluate. + +**Pass integration (respects existing 7-pass contract):** +- Hard rejections → raised as the FIRST items in Pass 1, tagged \`[HARD REJECTION]\` +- Litmus DISAGREE items → raised in the relevant pass with both perspectives +- Litmus CONFIRMED failures → pre-loaded as known issues in the relevant pass +- Passes can skip discovery and go straight to fixing for pre-identified issues` : + isDesignConsultation ? ` +**Synthesis:** Claude main references both Codex and subagent proposals in the Phase 3 proposal. Present: +- Areas of agreement between all three voices (Claude main + Codex + subagent) +- Genuine divergences as creative alternatives for the user to choose from +- "Codex and I agree on X. Codex suggested Y where I'm proposing Z — here's why..."` : ` +**Synthesis — Litmus scorecard:** + +Use the same scorecard format as /plan-design-review (shown above). Fill in from both outputs. +Merge findings into the triage with \`[codex]\` / \`[subagent]\` / \`[cross-model]\` tags.`; + + const escapedCodexPrompt = codexPrompt.replace(/`/g, '\\`').replace(/\$/g, '\\$'); + + return `## Design Outside Voices (parallel) +${optInSection} + +**Check Codex availability:** +\`\`\`bash +which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" +\`\`\` + +**If Codex is available**, launch both voices simultaneously: + +1. **Codex design voice** (via Bash): +\`\`\`bash +TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) +codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +\`\`\` +Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: +\`\`\`bash +cat "$TMPERR_DESIGN" && rm -f "$TMPERR_DESIGN" +\`\`\` + +2. **Claude design subagent** (via Agent tool): +Dispatch a subagent with this prompt: +"${subagentPrompt}" + +**Error handling (all non-blocking):** +- **Auth failure:** If stderr contains "auth", "login", "unauthorized", or "API key": "Codex authentication failed. Run \`codex login\` to authenticate." +- **Timeout:** "Codex timed out after 5 minutes." +- **Empty response:** "Codex returned no response." +- On any Codex error: proceed with Claude subagent output only, tagged \`[single-model]\`. +- If Claude subagent also fails: "Outside voices unavailable — continuing with primary review." + +Present Codex output under a \`CODEX SAYS (design ${isPlanDesignReview ? 'critique' : isDesignReview ? 'source audit' : 'direction'}):\` header. +Present subagent output under a \`CLAUDE SUBAGENT (design ${isPlanDesignReview ? 'completeness' : isDesignReview ? 'consistency' : 'direction'}):\` header. +${synthesisSection} + +**Log the result:** +\`\`\`bash +${ctx.paths.binDir}/gstack-review-log '{"skill":"design-outside-voices","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","status":"STATUS","source":"SOURCE","commit":"'"$(git rev-parse --short HEAD)"'"}' +\`\`\` +Replace STATUS with "clean" or "issues_found", SOURCE with "codex+subagent", "codex-only", "subagent-only", or "unavailable".`; +} + +// ─── Design Hard Rules (OpenAI framework + gstack slop blacklist) ─── + +function generateDesignHardRules(_ctx: TemplateContext): string { + const slopItems = AI_SLOP_BLACKLIST.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const rejectionItems = OPENAI_HARD_REJECTIONS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + const litmusItems = OPENAI_LITMUS_CHECKS.map((item, i) => `${i + 1}. ${item}`).join('\n'); + + return `### Design Hard Rules + +**Classifier — determine rule set before evaluating:** +- **MARKETING/LANDING PAGE** (hero-driven, brand-forward, conversion-focused) → apply Landing Page Rules +- **APP UI** (workspace-driven, data-dense, task-focused: dashboards, admin, settings) → apply App UI Rules +- **HYBRID** (marketing shell with app-like sections) → apply Landing Page Rules to hero/marketing sections, App UI Rules to functional sections + +**Hard rejection criteria** (instant-fail patterns — flag if ANY apply): +${rejectionItems} + +**Litmus checks** (answer YES/NO for each — used for cross-model consensus scoring): +${litmusItems} + +**Landing page rules** (apply when classifier = MARKETING/LANDING): +- First viewport reads as one composition, not a dashboard +- Brand-first hierarchy: brand > headline > body > CTA +- Typography: expressive, purposeful — no default stacks (Inter, Roboto, Arial, system) +- No flat single-color backgrounds — use gradients, images, subtle patterns +- Hero: full-bleed, edge-to-edge, no inset/tiled/rounded variants +- Hero budget: brand, one headline, one supporting sentence, one CTA group, one image +- No cards in hero. Cards only when card IS the interaction +- One job per section: one purpose, one headline, one short supporting sentence +- Motion: 2-3 intentional motions minimum (entrance, scroll-linked, hover/reveal) +- Color: define CSS variables, avoid purple-on-white defaults, one accent color default +- Copy: product language not design commentary. "If deleting 30% improves it, keep deleting" +- Beautiful defaults: composition-first, brand as loudest text, two typefaces max, cardless by default, first viewport as poster not document + +**App UI rules** (apply when classifier = APP UI): +- Calm surface hierarchy, strong typography, few colors +- Dense but readable, minimal chrome +- Organize: primary workspace, navigation, secondary context, one accent +- Avoid: dashboard-card mosaics, thick borders, decorative gradients, ornamental icons +- Copy: utility language — orientation, status, action. Not mood/brand/aspiration +- Cards only when card IS the interaction +- Section headings state what area is or what user can do ("Selected KPIs", "Plan status") + +**Universal rules** (apply to ALL types): +- Define CSS variables for color system +- No default font stacks (Inter, Roboto, Arial, system) +- One job per section +- "If deleting 30% of the copy improves it, keep deleting" +- Cards earn their existence — no decorative card grids + +**AI Slop blacklist** (the 10 patterns that scream "AI-generated"): +${slopItems} + +Source: [OpenAI "Designing Delightful Frontends with GPT-5.4"](https://developers.openai.com/blog/designing-delightful-frontends-with-gpt-5-4) (Mar 2026) + gstack design methodology.`; +} + +function generateSlugEval(ctx: TemplateContext): string { + return `eval "$(${ctx.paths.binDir}/gstack-slug 2>/dev/null)"`; +} + +function generateSlugSetup(ctx: TemplateContext): string { + return `eval "$(${ctx.paths.binDir}/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG`; +} + +const RESOLVERS: Record string> = { + SLUG_EVAL: generateSlugEval, + SLUG_SETUP: generateSlugSetup, + COMMAND_REFERENCE: generateCommandReference, + SNAPSHOT_FLAGS: generateSnapshotFlags, + PREAMBLE: generatePreamble, + BROWSE_SETUP: generateBrowseSetup, + BASE_BRANCH_DETECT: generateBaseBranchDetect, + QA_METHODOLOGY: generateQAMethodology, + DESIGN_METHODOLOGY: generateDesignMethodology, + DESIGN_HARD_RULES: generateDesignHardRules, + DESIGN_OUTSIDE_VOICES: generateDesignOutsideVoices, + DESIGN_REVIEW_LITE: generateDesignReviewLite, + REVIEW_DASHBOARD: generateReviewDashboard, + PLAN_FILE_REVIEW_REPORT: generatePlanFileReviewReport, + TEST_BOOTSTRAP: generateTestBootstrap, + TEST_COVERAGE_AUDIT_PLAN: generateTestCoverageAuditPlan, + TEST_COVERAGE_AUDIT_SHIP: generateTestCoverageAuditShip, + TEST_COVERAGE_AUDIT_REVIEW: generateTestCoverageAuditReview, + TEST_FAILURE_TRIAGE: generateTestFailureTriage, + SPEC_REVIEW_LOOP: generateSpecReviewLoop, + DESIGN_SKETCH: generateDesignSketch, + BENEFITS_FROM: generateBenefitsFrom, + CODEX_SECOND_OPINION: generateCodexSecondOpinion, + CODEX_REVIEW_STEP: generateAdversarialStep, + ADVERSARIAL_STEP: generateAdversarialStep, + DEPLOY_BOOTSTRAP: generateDeployBootstrap, + CODEX_PLAN_REVIEW: generateCodexPlanReview, + PLAN_COMPLETION_AUDIT_SHIP: generatePlanCompletionAuditShip, + PLAN_COMPLETION_AUDIT_REVIEW: generatePlanCompletionAuditReview, + PLAN_VERIFICATION_EXEC: generatePlanVerificationExec, +}; + +// ─── Codex Helpers ─────────────────────────────────────────── + +function codexSkillName(skillDir: string): string { + if (skillDir === '.' || skillDir === '') return 'gstack'; + // Don't double-prefix: gstack-upgrade → gstack-upgrade (not gstack-gstack-upgrade) + if (skillDir.startsWith('gstack-')) return skillDir; + return `gstack-${skillDir}`; +} + +function extractNameAndDescription(content: string): { name: string; description: string } { + const fmStart = content.indexOf('---\n'); + if (fmStart !== 0) return { name: '', description: '' }; + const fmEnd = content.indexOf('\n---', fmStart + 4); + if (fmEnd === -1) return { name: '', description: '' }; + + const frontmatter = content.slice(fmStart + 4, fmEnd); + const nameMatch = frontmatter.match(/^name:\s*(.+)$/m); + const name = nameMatch ? nameMatch[1].trim() : ''; + + let description = ''; + const lines = frontmatter.split('\n'); + let inDescription = false; + const descLines: string[] = []; + for (const line of lines) { + if (line.match(/^description:\s*\|?\s*$/)) { + inDescription = true; + continue; + } + if (line.match(/^description:\s*\S/)) { + description = line.replace(/^description:\s*/, '').trim(); + break; + } + if (inDescription) { + if (line === '' || line.match(/^\s/)) { + descLines.push(line.replace(/^ /, '')); + } else { + break; + } + } + } + if (descLines.length > 0) { + description = descLines.join('\n').trim(); + } + + return { name, description }; +} + +const OPENAI_SHORT_DESCRIPTION_LIMIT = 120; + +function condenseOpenAIShortDescription(description: string): string { + const firstParagraph = description.split(/\n\s*\n/)[0] || description; + const collapsed = firstParagraph.replace(/\s+/g, ' ').trim(); + if (collapsed.length <= OPENAI_SHORT_DESCRIPTION_LIMIT) return collapsed; + + const truncated = collapsed.slice(0, OPENAI_SHORT_DESCRIPTION_LIMIT - 3); + const lastSpace = truncated.lastIndexOf(' '); + const safe = lastSpace > 40 ? truncated.slice(0, lastSpace) : truncated; + return `${safe}...`; +} + +function generateOpenAIYaml(displayName: string, shortDescription: string): string { + return `interface: + display_name: ${JSON.stringify(displayName)} + short_description: ${JSON.stringify(shortDescription)} + default_prompt: ${JSON.stringify(`Use ${displayName} for this task.`)} +policy: + allow_implicit_invocation: true +`; +} + +/** + * Transform frontmatter for Codex: keep only name + description. + * Strips allowed-tools, hooks, version, and all other fields. + * Handles multiline block scalar descriptions (YAML | syntax). + */ +function transformFrontmatter(content: string, host: Host): string { + if (host === 'claude') return content; + + const fmStart = content.indexOf('---\n'); + if (fmStart !== 0) return content; + const fmEnd = content.indexOf('\n---', fmStart + 4); + if (fmEnd === -1) return content; + const body = content.slice(fmEnd + 4); // includes the leading \n after --- + const { name, description } = extractNameAndDescription(content); + + // Codex 1024-char description limit — fail build, don't ship broken skills + const MAX_DESC = 1024; + if (description.length > MAX_DESC) { + throw new Error( + `Codex description for "${name}" is ${description.length} chars (max ${MAX_DESC}). ` + + `Compress the description in the .tmpl file.` + ); + } + + // Re-emit Codex frontmatter (name + description only) + const indentedDesc = description.split('\n').map(l => ` ${l}`).join('\n'); + const codexFm = `---\nname: ${name}\ndescription: |\n${indentedDesc}\n---`; + return codexFm + body; +} + +/** + * Extract hook descriptions from frontmatter for inline safety prose. + * Returns a description of what the hooks do, or null if no hooks. + */ +function extractHookSafetyProse(tmplContent: string): string | null { + if (!tmplContent.match(/^hooks:/m)) return null; + + // Parse the hook matchers to build a human-readable safety description + const matchers: string[] = []; + const matcherRegex = /matcher:\s*"(\w+)"/g; + let m; + while ((m = matcherRegex.exec(tmplContent)) !== null) { + if (!matchers.includes(m[1])) matchers.push(m[1]); + } + + if (matchers.length === 0) return null; + + // Build safety prose based on what tools are hooked + const toolDescriptions: Record = { + Bash: 'check bash commands for destructive operations (rm -rf, DROP TABLE, force-push, git reset --hard, etc.) before execution', + Edit: 'verify file edits are within the allowed scope boundary before applying', + Write: 'verify file writes are within the allowed scope boundary before applying', + }; + + const safetyChecks = matchers + .map(t => toolDescriptions[t] || `check ${t} operations for safety`) + .join(', and '); + + return `> **Safety Advisory:** This skill includes safety checks that ${safetyChecks}. When using this skill, always pause and verify before executing potentially destructive operations. If uncertain about a command's safety, ask the user for confirmation before proceeding.`; +} + // ─── Template Processing ──────────────────────────────────── const GENERATED_HEADER = `\n\n`; @@ -84,17 +3016,6 @@ function processTemplate(tmplPath: string, host: Host = 'claude'): { outputPath: throw new Error(`Unresolved placeholders in ${relTmplPath}: ${remaining.join(', ')}`); } - // Inject auto-trigger guard into skill descriptions. - // Adds explicit trigger criteria so Claude Code doesn't auto-fire skills - // based on semantic similarity. Preserves existing "Use when" and - // "Proactively suggest" text (both are tested in skill-validation.test.ts). - const triggerGuard = ` MANUAL TRIGGER ONLY: invoke only when user types /${skillName}.\n`; - const descMatch = content.match(/^(description:\s*\|?\s*\n)/m); - if (descMatch && descMatch.index !== undefined) { - const insertAt = descMatch.index + descMatch[0].length; - content = content.slice(0, insertAt) + triggerGuard + content.slice(insertAt); - } - // For codex host: transform frontmatter and replace Claude-specific paths if (host === 'codex') { // Extract hook safety prose BEFORE transforming frontmatter (which strips hooks) diff --git a/scripts/resolvers/design.ts b/scripts/resolvers/design.ts index c4926112a..30b1fe2c8 100644 --- a/scripts/resolvers/design.ts +++ b/scripts/resolvers/design.ts @@ -17,7 +17,7 @@ If Codex is available, run a lightweight design check on the diff: \`\`\`bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -467,7 +467,7 @@ If user chooses A, launch both voices simultaneously: 1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): \`\`\`bash TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) -codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` @@ -636,7 +636,7 @@ which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" 1. **Codex design voice** (via Bash): \`\`\`bash TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) -codex exec "${escapedCodexPrompt}" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: \`\`\`bash diff --git a/scripts/resolvers/index.ts b/scripts/resolvers/index.ts index 95c6ea082..9e9b9596f 100644 --- a/scripts/resolvers/index.ts +++ b/scripts/resolvers/index.ts @@ -11,7 +11,7 @@ import { generateTestFailureTriage } from './preamble'; import { generateCommandReference, generateSnapshotFlags, generateBrowseSetup } from './browse'; import { generateDesignMethodology, generateDesignHardRules, generateDesignOutsideVoices, generateDesignReviewLite, generateDesignSketch } from './design'; import { generateTestBootstrap, generateTestCoverageAuditPlan, generateTestCoverageAuditShip, generateTestCoverageAuditReview } from './testing'; -import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview } from './review'; +import { generateReviewDashboard, generatePlanFileReviewReport, generateSpecReviewLoop, generateBenefitsFrom, generateCodexSecondOpinion, generateAdversarialStep, generateCodexPlanReview, generatePlanCompletionAuditShip, generatePlanCompletionAuditReview, generatePlanVerificationExec } from './review'; import { generateSlugEval, generateSlugSetup, generateBaseBranchDetect, generateDeployBootstrap, generateQAMethodology } from './utility'; export const RESOLVERS: Record string> = { @@ -41,4 +41,7 @@ export const RESOLVERS: Record string> = { ADVERSARIAL_STEP: generateAdversarialStep, DEPLOY_BOOTSTRAP: generateDeployBootstrap, CODEX_PLAN_REVIEW: generateCodexPlanReview, + PLAN_COMPLETION_AUDIT_SHIP: generatePlanCompletionAuditShip, + PLAN_COMPLETION_AUDIT_REVIEW: generatePlanCompletionAuditReview, + PLAN_VERIFICATION_EXEC: generatePlanVerificationExec, }; diff --git a/scripts/resolvers/preamble.ts b/scripts/resolvers/preamble.ts index 1fdfed1d8..76573422e 100644 --- a/scripts/resolvers/preamble.ts +++ b/scripts/resolvers/preamble.ts @@ -21,9 +21,11 @@ _SESSIONS=$(find ~/.gstack/sessions -mmin -120 -type f 2>/dev/null | wc -l | tr find ~/.gstack/sessions -mmin +120 -type f -delete 2>/dev/null || true _CONTRIB=$(${ctx.paths.binDir}/gstack-config get gstack_contributor 2>/dev/null || true) _PROACTIVE=$(${ctx.paths.binDir}/gstack-config get proactive 2>/dev/null || echo "true") +_PROACTIVE_PROMPTED=$([ -f ~/.gstack/.proactive-prompted ] && echo "yes" || echo "no") _BRANCH=$(git branch --show-current 2>/dev/null || echo "unknown") echo "BRANCH: $_BRANCH" echo "PROACTIVE: $_PROACTIVE" +echo "PROACTIVE_PROMPTED: $_PROACTIVE_PROMPTED" source <(${ctx.paths.binDir}/gstack-repo-mode 2>/dev/null) || true REPO_MODE=\${REPO_MODE:-unknown} echo "REPO_MODE: $REPO_MODE" @@ -43,8 +45,11 @@ for _PF in $(find ~/.gstack/analytics -maxdepth 1 -name '.pending-*' 2>/dev/null } function generateUpgradeCheck(ctx: TemplateContext): string { - return `If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills — only invoke -them when the user explicitly asks. The user opted out of proactive suggestions. + return `If \`PROACTIVE\` is \`"false"\`, do not proactively suggest gstack skills AND do not +auto-invoke skills based on conversation context. Only run skills the user explicitly +types (e.g., /qa, /ship). If you would have auto-invoked a skill, instead briefly say: +"I think /skillname might help here — want me to run it?" and wait for confirmation. +The user opted out of proactive behavior. If output shows \`UPGRADE_AVAILABLE \`: read \`${ctx.paths.skillRoot}/gstack-upgrade/SKILL.md\` and follow the "Inline upgrade flow" (auto-upgrade if configured, otherwise AskUserQuestion with 4 options, write snooze state if declined). If \`JUST_UPGRADED \`: tell user "Running gstack v{to} (just updated!)" and continue.`; } @@ -98,6 +103,29 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If \`TEL_PROMPTED\` is \`yes\`, skip this entirely.`; } +function generateProactivePrompt(ctx: TemplateContext): string { + return `If \`PROACTIVE_PROMPTED\` is \`no\` AND \`TEL_PROMPTED\` is \`yes\`: After telemetry is handled, +ask the user about proactive behavior. Use AskUserQuestion: + +> gstack can proactively figure out when you might need a skill while you work — +> like suggesting /qa when you say "does this work?" or /investigate when you hit +> a bug. We recommend keeping this on — it speeds up every part of your workflow. + +Options: +- A) Keep it on (recommended) +- B) Turn it off — I'll type /commands myself + +If A: run \`${ctx.paths.binDir}/gstack-config set proactive true\` +If B: run \`${ctx.paths.binDir}/gstack-config set proactive false\` + +Always run: +\`\`\`bash +touch ~/.gstack/.proactive-prompted +\`\`\` + +This only happens once. If \`PROACTIVE_PROMPTED\` is \`yes\`, skip this entirely.`; +} + function generateAskUserFormat(_ctx: TemplateContext): string { return `## AskUserQuestion Format @@ -382,6 +410,7 @@ export function generatePreamble(ctx: TemplateContext): string { generateUpgradeCheck(ctx), generateLakeIntro(), generateTelemetryPrompt(ctx), + generateProactivePrompt(ctx), ...(tier >= 2 ? [generateAskUserFormat(ctx), generateCompletenessSection()] : []), ...(tier >= 3 ? [generateRepoModeSection(), generateSearchBeforeBuildingSection(ctx)] : []), generateContributorMode(), diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 0087cda7c..1831e098b 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -286,7 +286,7 @@ Write the full prompt (context block + instructions) to this file. Use the mode- \`\`\`bash TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) -codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -370,7 +370,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal \`\`\`bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" \`\`\` Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr: @@ -525,7 +525,7 @@ THE PLAN: \`\`\`bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -592,3 +592,236 @@ SOURCE = "codex" if Codex ran, "claude" if subagent ran. ---`; } + +// ─── Plan File Discovery (shared helper) ────────────────────────────── + +function generatePlanFileDiscovery(): string { + return `### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like \`~/.claude/plans/*.md\` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +\`\`\`bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +\`\`\` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping."`; +} + +// ─── Plan Completion Audit ──────────────────────────────────────────── + +type PlanCompletionMode = 'ship' | 'review'; + +function generatePlanCompletionAuditInner(mode: PlanCompletionMode): string { + const sections: string[] = []; + + // ── Plan file discovery (shared) ── + sections.push(generatePlanFileDiscovery()); + + // ── Item extraction ── + sections.push(` +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** \`- [ ] ...\` or \`- [x] ...\` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (\`## Context\`, \`## Background\`, \`## Problem\`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (\`## GSTACK REVIEW REPORT\`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS`); + + // ── Cross-reference against diff ── + sections.push(` +### Cross-Reference Against Diff + +Run \`git diff origin/...HEAD\` and \`git log origin/..HEAD --oneline\` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed.`); + + // ── Output format ── + sections.push(` +### Output Format + +\`\`\` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +\`\`\``); + + // ── Gate logic (mode-specific) ── + if (mode === 'ship') { + sections.push(` +### Gate Logic + +After producing the completion checklist: + +- **All DONE or CHANGED:** Pass. "Plan completion: PASS — all items addressed." Continue. +- **Only PARTIAL items (no NOT DONE):** Continue with a note in the PR body. Not blocking. +- **Any NOT DONE items:** Use AskUserQuestion: + - Show the completion checklist above + - "{N} items from the plan are NOT DONE. These were part of the original plan but are missing from the implementation." + - RECOMMENDATION: depends on item count and severity. If 1-2 minor items (docs, config), recommend B. If core functionality is missing, recommend A. + - Options: + A) Stop — implement the missing items before shipping + B) Ship anyway — defer these to a follow-up (will create P1 TODOs in Step 5.5) + C) These items were intentionally dropped — remove from scope + - If A: STOP. List the missing items for the user to implement. + - If B: Continue. For each NOT DONE item, create a P1 TODO in Step 5.5 with "Deferred from plan: {plan file path}". + - If C: Continue. Note in PR body: "Plan items intentionally dropped: {list}." + +**No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." + +**Include in PR body (Step 8):** Add a \`## Plan Completion\` section with the checklist summary.`); + } else { + // review mode + sections.push(` +### Integration with Scope Drift Detection + +The plan completion results augment the existing Scope Drift Detection. If a plan file is found: + +- **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report. +- **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection. + +This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior). + +Update the scope drift output to include plan file context: + +\`\`\` +Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING] +Intent: +Plan: +Delivered: <1-line summary of what the diff actually does> +Plan items: N DONE, M PARTIAL, K NOT DONE +[If NOT DONE: list each missing item] +[If scope creep: list each out-of-scope change not in the plan] +\`\`\` + +**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only).`); + } + + return sections.join('\n'); +} + +export function generatePlanCompletionAuditShip(_ctx: TemplateContext): string { + return generatePlanCompletionAuditInner('ship'); +} + +export function generatePlanCompletionAuditReview(_ctx: TemplateContext): string { + return generatePlanCompletionAuditInner('review'); +} + +// ─── Plan Verification Execution ────────────────────────────────────── + +export function generatePlanVerificationExec(_ctx: TemplateContext): string { + return `## Step 3.47: Plan Verification + +Automatically verify the plan's testing/verification steps using the \`/qa-only\` skill. + +### 1. Check for verification section + +Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: \`## Verification\`, \`## Test plan\`, \`## Testing\`, \`## How to test\`, \`## Manual testing\`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). + +**If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." +**If no plan file was found in Step 3.45:** Skip (already handled). + +### 2. Check for running dev server + +Before invoking browse-based verification, check if a dev server is reachable: + +\`\`\`bash +curl -s -o /dev/null -w '%{http_code}' http://localhost:3000 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:8080 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:5173 2>/dev/null || \\ +curl -s -o /dev/null -w '%{http_code}' http://localhost:4000 2>/dev/null || echo "NO_SERVER" +\`\`\` + +**If NO_SERVER:** Skip with "No dev server detected — skipping plan verification. Run /qa separately after deploying." + +### 3. Invoke /qa-only inline + +Read the \`/qa-only\` skill from disk: + +\`\`\`bash +cat \${CLAUDE_SKILL_DIR}/../qa-only/SKILL.md +\`\`\` + +**If unreadable:** Skip with "Could not load /qa-only — skipping plan verification." + +Follow the /qa-only workflow with these modifications: +- **Skip the preamble** (already handled by /ship) +- **Use the plan's verification section as the primary test input** — treat each verification item as a test case +- **Use the detected dev server URL** as the base URL +- **Skip the fix loop** — this is report-only verification during /ship +- **Cap at the verification items from the plan** — do not expand into general site QA + +### 4. Gate logic + +- **All verification items PASS:** Continue silently. "Plan verification: PASS." +- **Any FAIL:** Use AskUserQuestion: + - Show the failures with screenshot evidence + - RECOMMENDATION: Choose A if failures indicate broken functionality. Choose B if cosmetic only. + - Options: + A) Fix the failures before shipping (recommended for functional issues) + B) Ship anyway — known issues (acceptable for cosmetic issues) +- **No verification section / no server / unreadable skill:** Skip (non-blocking). + +### 5. Include in PR body + +Add a \`## Verification Results\` section to the PR body (Step 8): +- If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) +- If skipped: reason for skipping (no plan, no server, no verification section)`; +} diff --git a/scripts/resolvers/testing.ts b/scripts/resolvers/testing.ts index 4ede82708..fde799dc0 100644 --- a/scripts/resolvers/testing.ts +++ b/scripts/resolvers/testing.ts @@ -454,7 +454,40 @@ find . -name '*.test.*' -o -name '*.spec.*' -o -name '*_test.*' -o -name '*_spec \`\`\` For PR body: \`Tests: {before} → {after} (+{delta} new)\` -Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\``); +Coverage line: \`Test Coverage Audit: N new code paths. M covered (X%). K tests generated, J committed.\` + +**7. Coverage gate:** + +Before proceeding, check CLAUDE.md for a \`## Test Coverage\` section with \`Minimum:\` and \`Target:\` fields. If found, use those percentages. Otherwise use defaults: Minimum = 60%, Target = 80%. + +Using the coverage percentage from the diagram in substep 4 (the \`COVERAGE: X/Y (Z%)\` line): + +- **>= target:** Pass. "Coverage gate: PASS ({X}%)." Continue. +- **>= minimum, < target:** Use AskUserQuestion: + - "AI-assessed coverage is {X}%. {N} code paths are untested. Target is {target}%." + - RECOMMENDATION: Choose A because untested code paths are where production bugs hide. + - Options: + A) Generate more tests for remaining gaps (recommended) + B) Ship anyway — I accept the coverage risk + C) These paths don't need tests — mark as intentionally uncovered + - If A: Loop back to substep 5 (generate tests) targeting the remaining gaps. After second pass, if still below target, present AskUserQuestion again with updated numbers. Maximum 2 generation passes total. + - If B: Continue. Include in PR body: "Coverage gate: {X}% — user accepted risk." + - If C: Continue. Include in PR body: "Coverage gate: {X}% — {N} paths intentionally uncovered." + +- **< minimum:** Use AskUserQuestion: + - "AI-assessed coverage is critically low ({X}%). {N} of {M} code paths have no tests. Minimum threshold is {minimum}%." + - RECOMMENDATION: Choose A because less than {minimum}% means more code is untested than tested. + - Options: + A) Generate tests for remaining gaps (recommended) + B) Override — ship with low coverage (I understand the risk) + - If A: Loop back to substep 5. Maximum 2 passes. If still below minimum after 2 passes, present the override choice again. + - If B: Continue. Include in PR body: "Coverage gate: OVERRIDDEN at {X}%." + +**Coverage percentage undetermined:** If the coverage diagram doesn't produce a clear numeric percentage (ambiguous output, parse error), **skip the gate** with: "Coverage gate: could not determine percentage — skipping." Do not default to 0% or block. + +**Test-only diffs:** Skip the gate (same as the existing fast-path). + +**100% coverage:** "Coverage gate: PASS (100%)." Continue.`); // ── Test plan artifact (ship mode) ── sections.push(` @@ -504,7 +537,22 @@ If test framework is detected and gaps were identified: If no test framework detected → include gaps as INFORMATIONAL findings only, no generation. -**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit."`); +**Diff is test-only changes:** Skip Step 4.75 entirely: "No new application code paths to audit." + +### Coverage Warning + +After producing the coverage diagram, check the coverage percentage. Read CLAUDE.md for a \`## Test Coverage\` section with a \`Minimum:\` field. If not found, use default: 60%. + +If coverage is below the minimum threshold, output a prominent warning **before** the regular review findings: + +\`\`\` +⚠️ COVERAGE WARNING: AI-assessed coverage is {X}%. {N} code paths untested. +Consider writing tests before running /ship. +\`\`\` + +This is INFORMATIONAL — does not block /review. But it makes low coverage visible early so the developer can address it before reaching the /ship coverage gate. + +If coverage percentage cannot be determined, skip the warning silently.`); } return sections.join('\n'); diff --git a/setup-browser-cookies/SKILL.md b/setup-browser-cookies/SKILL.md index 37c944a0d..85c1ce20e 100644 --- a/setup-browser-cookies/SKILL.md +++ b/setup-browser-cookies/SKILL.md @@ -3,7 +3,6 @@ name: setup-browser-cookies preamble-tier: 1 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /setup-browser-cookies. Import cookies from your real Chromium browser into the headless browse session. Opens an interactive picker UI where you select which cookie domains to import. Use before QA testing authenticated pages. Use when asked to "import cookies", @@ -97,23 +96,112 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/setup-deploy/SKILL.md b/setup-deploy/SKILL.md index 6d1a35bb7..9eba44790 100644 --- a/setup-deploy/SKILL.md +++ b/setup-deploy/SKILL.md @@ -3,7 +3,6 @@ name: setup-deploy preamble-tier: 2 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /setup-deploy. Configure deployment settings for /land-and-deploy. Detects your deploy platform (Fly.io, Render, Vercel, Netlify, Heroku, GitHub Actions, custom), production URL, health check endpoints, and deploy status commands. Writes @@ -110,6 +109,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -117,36 +117,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/ship/SKILL.md.tmpl b/ship/SKILL.md.tmpl index ce859cf37..7f82c64dd 100644 --- a/ship/SKILL.md.tmpl +++ b/ship/SKILL.md.tmpl @@ -32,6 +32,9 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Pre-landing review finds ASK items that need user judgment - MINOR or MAJOR version bump needed (ask — see Step 4) - Greptile review comments that need user decision (complex fixes, false positives) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) +- Plan items NOT DONE with no user override (see Step 3.45) +- Plan verification failures (see Step 3.47) - TODOS.md missing and user wants to create one (ask — see Step 5.5) - TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) @@ -43,7 +46,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Multi-file changesets (auto-split into bisectable commits) - TODOS.md completed-item detection (auto-mark) - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) -- Test coverage gaps (auto-generate and commit, or flag in PR body) +- Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) --- @@ -225,6 +228,16 @@ If multiple suites need to run, run them sequentially (each needs a test lane). --- +## Step 3.45: Plan Completion Audit + +{{PLAN_COMPLETION_AUDIT_SHIP}} + +--- + +{{PLAN_VERIFICATION_EXEC}} + +--- + ## Step 3.5: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -500,6 +513,16 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' +## Plan Completion + + + + +## Verification Results + + + + ## TODOS @@ -540,6 +563,32 @@ doc updates — the user runs `/ship` and documentation stays current without a --- +## Step 8.75: Persist ship metrics + +Log coverage and plan completion data so `/retro` can track trends: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +``` + +Append to `~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl`: + +```bash +echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage_pct":COVERAGE_PCT,"plan_items_total":PLAN_TOTAL,"plan_items_done":PLAN_DONE,"verification_result":"VERIFY_RESULT","version":"VERSION","branch":"BRANCH"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute from earlier steps: +- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **VERSION**: from the VERSION file +- **BRANCH**: current branch name + +This step is automatic — never skip it, never ask for confirmation. + +--- + ## Important Rules - **Never skip tests.** If tests fail, stop. diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index 590f49504..d8a071a1f 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -152,24 +152,6 @@ describe('gen-skill-docs', () => { } }); - test('every Codex SKILL.md description stays under 900-char warning threshold', () => { - const WARN_THRESHOLD = 900; - const agentsDir = path.join(ROOT, '.agents', 'skills'); - if (!fs.existsSync(agentsDir)) return; - const violations: string[] = []; - for (const entry of fs.readdirSync(agentsDir, { withFileTypes: true })) { - if (!entry.isDirectory()) continue; - const skillMd = path.join(agentsDir, entry.name, 'SKILL.md'); - if (!fs.existsSync(skillMd)) continue; - const content = fs.readFileSync(skillMd, 'utf-8'); - const description = extractDescription(content); - if (description.length > WARN_THRESHOLD) { - violations.push(`${entry.name}: ${description.length} chars (limit ${MAX_SKILL_DESCRIPTION_LENGTH}, ${MAX_SKILL_DESCRIPTION_LENGTH - description.length} remaining)`); - } - } - expect(violations).toEqual([]); - }); - test('package.json version matches VERSION file', () => { const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8')); const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim(); @@ -695,6 +677,168 @@ describe('PLAN_FILE_REVIEW_REPORT resolver', () => { }); }); +// --- {{PLAN_COMPLETION_AUDIT}} resolver tests --- + +describe('PLAN_COMPLETION_AUDIT placeholders', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains plan completion audit step', () => { + expect(shipSkill).toContain('Plan Completion Audit'); + expect(shipSkill).toContain('Step 3.45'); + }); + + test('review SKILL.md contains plan completion in scope drift', () => { + expect(reviewSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Actionable Item Extraction'); + expect(reviewSkill).toContain('Integration with Scope Drift Detection'); + }); + + test('both modes share plan file discovery methodology', () => { + expect(shipSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Plan File Discovery'); + // Both should have conversation context first + expect(shipSkill).toContain('Conversation context (primary)'); + expect(reviewSkill).toContain('Conversation context (primary)'); + // Both should have grep fallback + expect(shipSkill).toContain('Content-based search (fallback)'); + expect(reviewSkill).toContain('Content-based search (fallback)'); + }); + + test('ship mode has gate logic for NOT DONE items', () => { + expect(shipSkill).toContain('NOT DONE'); + expect(shipSkill).toContain('Stop — implement the missing items'); + expect(shipSkill).toContain('Ship anyway — defer'); + expect(shipSkill).toContain('intentionally dropped'); + }); + + test('review mode is INFORMATIONAL only', () => { + expect(reviewSkill).toContain('INFORMATIONAL'); + expect(reviewSkill).toContain('MISSING REQUIREMENTS'); + expect(reviewSkill).toContain('SCOPE CREEP'); + }); + + test('item extraction has 50-item cap', () => { + expect(shipSkill).toContain('at most 50 items'); + }); + + test('uses file-level traceability (not commit-level)', () => { + expect(shipSkill).toContain('Cite the specific file'); + expect(shipSkill).not.toContain('commit-level traceability'); + }); +}); + +// --- {{PLAN_VERIFICATION_EXEC}} resolver tests --- + +describe('PLAN_VERIFICATION_EXEC placeholder', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains plan verification step', () => { + expect(shipSkill).toContain('Step 3.47'); + expect(shipSkill).toContain('Plan Verification'); + }); + + test('references /qa-only invocation', () => { + expect(shipSkill).toContain('qa-only/SKILL.md'); + expect(shipSkill).toContain('qa-only'); + }); + + test('contains localhost reachability check', () => { + expect(shipSkill).toContain('localhost:3000'); + expect(shipSkill).toContain('NO_SERVER'); + }); + + test('skips gracefully when no verification section', () => { + expect(shipSkill).toContain('No verification steps found in plan'); + }); + + test('skips gracefully when no dev server', () => { + expect(shipSkill).toContain('No dev server detected'); + }); +}); + +// --- Coverage gate tests --- + +describe('Coverage gate in ship', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains coverage gate with thresholds', () => { + expect(shipSkill).toContain('Coverage gate'); + expect(shipSkill).toContain('>= target'); + expect(shipSkill).toContain('< minimum'); + }); + + test('ship SKILL.md supports configurable thresholds via CLAUDE.md', () => { + expect(shipSkill).toContain('## Test Coverage'); + expect(shipSkill).toContain('Minimum:'); + expect(shipSkill).toContain('Target:'); + }); + + test('coverage gate skips on parse failure (not block)', () => { + expect(shipSkill).toContain('could not determine percentage — skipping'); + }); + + test('review SKILL.md contains coverage WARNING', () => { + expect(reviewSkill).toContain('COVERAGE WARNING'); + expect(reviewSkill).toContain('Consider writing tests before running /ship'); + }); + + test('review coverage warning is INFORMATIONAL', () => { + expect(reviewSkill).toContain('INFORMATIONAL'); + }); +}); + +// --- Ship metrics logging --- + +describe('Ship metrics logging', () => { + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + + test('ship SKILL.md contains metrics persistence step', () => { + expect(shipSkill).toContain('Step 8.75'); + expect(shipSkill).toContain('coverage_pct'); + expect(shipSkill).toContain('plan_items_total'); + expect(shipSkill).toContain('plan_items_done'); + expect(shipSkill).toContain('verification_result'); + }); +}); + +// --- Plan file discovery shared helper --- + +describe('Plan file discovery shared helper', () => { + // The shared helper should appear in ship (via PLAN_COMPLETION_AUDIT_SHIP) + // and in review (via PLAN_COMPLETION_AUDIT_REVIEW) + const shipSkill = fs.readFileSync(path.join(ROOT, 'ship', 'SKILL.md'), 'utf-8'); + const reviewSkill = fs.readFileSync(path.join(ROOT, 'review', 'SKILL.md'), 'utf-8'); + + test('plan file discovery appears in both ship and review', () => { + expect(shipSkill).toContain('Plan File Discovery'); + expect(reviewSkill).toContain('Plan File Discovery'); + }); + + test('both include conversation context first', () => { + expect(shipSkill).toContain('Conversation context (primary)'); + expect(reviewSkill).toContain('Conversation context (primary)'); + }); + + test('both include content-based fallback', () => { + expect(shipSkill).toContain('Content-based search (fallback)'); + expect(reviewSkill).toContain('Content-based search (fallback)'); + }); +}); + +// --- Retro plan completion --- + +describe('Retro plan completion section', () => { + const retroSkill = fs.readFileSync(path.join(ROOT, 'retro', 'SKILL.md'), 'utf-8'); + + test('retro SKILL.md contains plan completion section', () => { + expect(retroSkill).toContain('### Plan Completion'); + expect(retroSkill).toContain('plan_items_total'); + expect(retroSkill).toContain('Plan Completion This Period'); + }); +}); + // --- Plan status footer in preamble --- describe('Plan status footer in preamble', () => { diff --git a/test/helpers/e2e-helpers.ts b/test/helpers/e2e-helpers.ts index 4615307c2..70564acba 100644 --- a/test/helpers/e2e-helpers.ts +++ b/test/helpers/e2e-helpers.ts @@ -207,7 +207,7 @@ export async function finalizeEvalCollector(evalCollector: EvalCollector | null) if (evalsEnabled) { const gstackDir = path.join(os.homedir(), '.gstack'); fs.mkdirSync(gstackDir, { recursive: true }); - for (const f of ['.completeness-intro-seen', '.telemetry-prompted']) { + for (const f of ['.completeness-intro-seen', '.telemetry-prompted', '.proactive-prompted']) { const p = path.join(gstackDir, f); if (!fs.existsSync(p)) fs.writeFileSync(p, ''); } diff --git a/test/helpers/touchfiles.ts b/test/helpers/touchfiles.ts index 417369999..d61ae1647 100644 --- a/test/helpers/touchfiles.ts +++ b/test/helpers/touchfiles.ts @@ -107,12 +107,17 @@ export const E2E_TOUCHFILES: Record = { 'gemini-review-findings': ['review/**', '.agents/skills/gstack-review/**', 'test/helpers/gemini-session-runner.ts', 'lib/worktree.ts'], - // Coverage audit (shared fixture) + triage + // Coverage audit (shared fixture) + triage + gates 'ship-coverage-audit': ['ship/**', 'test/fixtures/coverage-audit-fixture.ts', 'bin/gstack-repo-mode'], 'review-coverage-audit': ['review/**', 'test/fixtures/coverage-audit-fixture.ts'], 'plan-eng-coverage-audit': ['plan-eng-review/**', 'test/fixtures/coverage-audit-fixture.ts'], 'ship-triage': ['ship/**', 'bin/gstack-repo-mode'], + // Plan completion audit + verification + 'ship-plan-completion': ['ship/**', 'scripts/gen-skill-docs.ts'], + 'ship-plan-verification': ['ship/**', 'qa-only/**', 'scripts/gen-skill-docs.ts'], + 'review-plan-completion': ['review/**', 'scripts/gen-skill-docs.ts'], + // Design 'design-consultation-core': ['design-consultation/**', 'scripts/gen-skill-docs.ts', 'test/helpers/llm-judge.ts'], 'design-consultation-existing': ['design-consultation/**', 'scripts/gen-skill-docs.ts'], diff --git a/unfreeze/SKILL.md b/unfreeze/SKILL.md index 16eda041e..d4ad37e2d 100644 --- a/unfreeze/SKILL.md +++ b/unfreeze/SKILL.md @@ -2,7 +2,6 @@ name: unfreeze version: 0.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /unfreeze. Clear the freeze boundary set by /freeze, allowing edits to all directories again. Use when you want to widen edit scope without ending the session. Use when asked to "unfreeze", "unlock edits", "remove freeze", or From 1b53fd2979dc97b55d6ab545fb1745e2980bb9ed Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:32:30 -0700 Subject: [PATCH 3/5] chore: regenerate SKILL.md files from updated templates Co-Authored-By: Claude Opus 4.6 (1M context) --- SKILL.md | 114 +++++++++++-- autoplan/SKILL.md | 91 ++++++++--- codex/SKILL.md | 91 ++++++++--- design-consultation/SKILL.md | 91 ++++++++--- design-review/SKILL.md | 93 ++++++++--- office-hours/SKILL.md | 91 ++++++++--- plan-ceo-review/SKILL.md | 91 ++++++++--- plan-design-review/SKILL.md | 91 ++++++++--- plan-eng-review/SKILL.md | 91 ++++++++--- review/SKILL.md | 206 ++++++++++++++++++++--- ship/SKILL.md | 307 ++++++++++++++++++++++++++++++++--- 11 files changed, 1097 insertions(+), 260 deletions(-) diff --git a/SKILL.md b/SKILL.md index bee0571d9..dada1e75d 100644 --- a/SKILL.md +++ b/SKILL.md @@ -3,19 +3,10 @@ name: gstack preamble-tier: 1 version: 1.1.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /gstack. Fast headless browser for QA testing and site dogfooding. Navigate pages, interact with elements, verify state, diff before/after, take annotated screenshots, test responsive layouts, forms, uploads, dialogs, and capture bug evidence. Use when asked to open or test a site, verify a deployment, dogfood a user flow, or file a bug with screenshots. - Also suggest adjacent gstack skills by stage: brainstorm /office-hours; strategy - /plan-ceo-review; architecture /plan-eng-review; design /plan-design-review or - /design-consultation; auto-review /autoplan; debugging /investigate; QA /qa; code review - /review; visual audit /design-review; shipping /ship; docs /document-release; retro - /retro; second opinion /codex; prod safety /careful or /guard; scoped edits /freeze or - /unfreeze; gstack upgrades /gstack-upgrade. If the user opts out of suggestions, stop - and run gstack-config set proactive false; if they opt back in, run gstack-config set - proactive true. allowed-tools: - Bash - Read @@ -106,23 +97,112 @@ touch ~/.gstack/.telemetry-prompted This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. +## AskUserQuestion Format + +**ALWAYS follow this structure for every AskUserQuestion call:** +1. **Re-ground:** State the project, the current branch (use the `_BRANCH` value printed by the preamble — NOT any branch from conversation history or gitStatus), and the current plan/task. (1-2 sentences) +2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. +3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. +4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. + +Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. + +Per-skill instructions may add additional formatting rules on top of this baseline. + +## Completeness Principle — Boil the Lake + +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: + +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: + +| Task type | Human team | CC+gstack | Compression | +|-----------|-----------|-----------|-------------| +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | + +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. + +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") + +## Repo Ownership Mode — See Something, Say Something + +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. + +## Search Before Building + +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: +```bash +jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true +``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." + ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index 036240987..e9161eabe 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -3,7 +3,6 @@ name: autoplan preamble-tier: 3 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /autoplan. Auto-review pipeline — reads the full CEO, design, and eng review skills from disk and runs them sequentially with auto-decisions using 6 decision principles. Surfaces taste decisions (close approaches, borderline scope, codex disagreements) at a final @@ -114,6 +113,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -121,54 +121,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/codex/SKILL.md b/codex/SKILL.md index 6b3d45c6f..f34b8db4c 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -3,7 +3,6 @@ name: codex preamble-tier: 3 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /codex. OpenAI Codex CLI wrapper — three modes. Code review: independent diff review via codex review with pass/fail gate. Challenge: adversarial mode that tries to break your code. Consult: ask codex anything with session continuity for follow-ups. @@ -108,6 +107,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -115,54 +115,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/design-consultation/SKILL.md b/design-consultation/SKILL.md index 826f3097b..e265f26de 100644 --- a/design-consultation/SKILL.md +++ b/design-consultation/SKILL.md @@ -3,7 +3,6 @@ name: design-consultation preamble-tier: 3 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /design-consultation. Design consultation: understands your product, researches the landscape, proposes a complete design system (aesthetic, typography, color, layout, spacing, motion), and generates font+color preview pages. Creates DESIGN.md as your project's design source @@ -112,6 +111,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,54 +119,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/design-review/SKILL.md b/design-review/SKILL.md index 953d9d1a6..383410334 100644 --- a/design-review/SKILL.md +++ b/design-review/SKILL.md @@ -3,7 +3,6 @@ name: design-review preamble-tier: 4 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /design-review. Designer's eye QA: finds visual inconsistency, spacing issues, hierarchy problems, AI slop patterns, and slow interactions — then fixes them. Iteratively fixes issues in source code, committing each fix atomically and re-verifying with before/after @@ -112,6 +111,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,54 +119,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol @@ -690,7 +733,7 @@ The test: would a human designer at a respected studio ever ship this? **10. Performance as Design** (6 items) - LCP < 2.0s (web apps), < 1.5s (informational sites) - CLS < 0.1 (no visible layout shifts during load) -- Skeleton quality: shapes match real content layout, shimmer animation +- Skeleton quality: shapes match real content, shimmer animation - Images: `loading="lazy"`, width/height dimensions set, WebP/AVIF format - Fonts: `font-display: swap`, preconnect to CDN origins - No visible font swap flash (FOUT) — critical fonts preloaded diff --git a/office-hours/SKILL.md b/office-hours/SKILL.md index 84a973aa8..8bf43efa5 100644 --- a/office-hours/SKILL.md +++ b/office-hours/SKILL.md @@ -3,7 +3,6 @@ name: office-hours preamble-tier: 3 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /office-hours. YC Office Hours — two modes. Startup mode: six forcing questions that expose demand reality, status quo, desperate specificity, narrowest wedge, observation, and future-fit. Builder mode: design thinking brainstorming for side projects, @@ -114,6 +113,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -121,54 +121,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/plan-ceo-review/SKILL.md b/plan-ceo-review/SKILL.md index a274efc08..e5d4af6a7 100644 --- a/plan-ceo-review/SKILL.md +++ b/plan-ceo-review/SKILL.md @@ -3,7 +3,6 @@ name: plan-ceo-review preamble-tier: 3 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /plan-ceo-review. CEO/founder-mode plan review. Rethink the problem, find the 10-star product, challenge premises, expand scope when it creates a better product. Four modes: SCOPE EXPANSION (dream big), SELECTIVE EXPANSION (hold scope + cherry-pick @@ -112,6 +111,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -119,54 +119,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/plan-design-review/SKILL.md b/plan-design-review/SKILL.md index ce5f9e756..9b45e8c8c 100644 --- a/plan-design-review/SKILL.md +++ b/plan-design-review/SKILL.md @@ -3,7 +3,6 @@ name: plan-design-review preamble-tier: 3 version: 2.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /plan-design-review. Designer's eye plan review — interactive, like CEO and Eng review. Rates each design dimension 0-10, explains what would make it a 10, then fixes the plan to get there. Works in plan mode. For live site @@ -110,6 +109,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -117,54 +117,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/plan-eng-review/SKILL.md b/plan-eng-review/SKILL.md index ecf0ae309..53bf7112d 100644 --- a/plan-eng-review/SKILL.md +++ b/plan-eng-review/SKILL.md @@ -3,7 +3,6 @@ name: plan-eng-review preamble-tier: 3 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /plan-eng-review. Eng manager-mode plan review. Lock in the execution plan — architecture, data flow, diagrams, edge cases, test coverage, performance. Walks through issues interactively with opinionated recommendations. Use when asked to @@ -111,6 +110,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -118,54 +118,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol diff --git a/review/SKILL.md b/review/SKILL.md index 912e1f3ec..a58e86274 100644 --- a/review/SKILL.md +++ b/review/SKILL.md @@ -3,7 +3,6 @@ name: review preamble-tier: 4 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /review. Pre-landing PR review. Analyzes diff against the base branch for SQL safety, LLM trust boundary violations, conditional side effects, and other structural issues. Use when asked to "review this PR", "code review", "pre-landing review", or "check my diff". @@ -110,6 +109,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -117,54 +117,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. + +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. + +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol @@ -297,7 +340,120 @@ Before reviewing code quality, check: **did they build what was requested — no **If no PR exists:** rely on commit messages and TODOS.md for stated intent — this is the common case since /review runs before /ship creates the PR. 2. Identify the **stated intent** — what was this branch supposed to accomplish? 3. Run `git diff origin/...HEAD --stat` and compare the files changed against the stated intent. -4. Evaluate with skepticism: + +### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like `~/.claude/plans/*.md` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +```bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +``` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping." + +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** `- [ ] ...` or `- [x] ...` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (`## Context`, `## Background`, `## Problem`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (`## GSTACK REVIEW REPORT`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS + +### Cross-Reference Against Diff + +Run `git diff origin/...HEAD` and `git log origin/..HEAD --oneline` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed. + +### Output Format + +``` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +``` + +### Integration with Scope Drift Detection + +The plan completion results augment the existing Scope Drift Detection. If a plan file is found: + +- **NOT DONE items** become additional evidence for **MISSING REQUIREMENTS** in the scope drift report. +- **Items in the diff that don't match any plan item** become evidence for **SCOPE CREEP** detection. + +This is **INFORMATIONAL** — does not block the review (consistent with existing scope drift behavior). + +Update the scope drift output to include plan file context: + +``` +Scope Check: [CLEAN / DRIFT DETECTED / REQUIREMENTS MISSING] +Intent: +Plan: +Delivered: <1-line summary of what the diff actually does> +Plan items: N DONE, M PARTIAL, K NOT DONE +[If NOT DONE: list each missing item] +[If scope creep: list each out-of-scope change not in the plan] +``` + +**No plan file found:** Fall back to existing scope drift behavior (check TODOS.md and PR description only). + +4. Evaluate with skepticism (incorporating plan completion results if available): **SCOPE CREEP detection:** - Files changed that are unrelated to the stated intent diff --git a/ship/SKILL.md b/ship/SKILL.md index 16d0e4b37..af2ea565c 100644 --- a/ship/SKILL.md +++ b/ship/SKILL.md @@ -3,7 +3,6 @@ name: ship preamble-tier: 4 version: 1.0.0 description: | - MANUAL TRIGGER ONLY: invoke only when user types /ship. Ship workflow: detect + merge base branch, run tests, review diff, bump VERSION, update CHANGELOG, commit, push, create PR. Use when asked to "ship", "deploy", "push to main", "create a PR", or "merge and push". Proactively suggest when the user says code is ready or asks about deploying. allowed-tools: @@ -108,6 +107,7 @@ This only happens once. If `TEL_PROMPTED` is `yes`, skip this entirely. 2. **Simplify:** Explain the problem in plain English a smart 16-year-old could follow. No raw function names, no internal jargon, no implementation details. Use concrete examples and analogies. Say what it DOES, not what it's called. 3. **Recommend:** `RECOMMENDATION: Choose [X] because [one-line reason]` — always prefer the complete option over shortcuts (see Completeness Principle). Include `Completeness: X/10` for each option. Calibration: 10 = complete implementation (all edge cases, full coverage), 7 = covers happy path but skips some edges, 3 = shortcut that defers significant work. If both options are 8+, pick the higher; if one is ≤5, flag it. 4. **Options:** Lettered options: `A) ... B) ... C) ...` — when an option involves effort, show both scales: `(human: ~X / CC: ~Y)` +5. **One decision per question:** NEVER combine multiple independent decisions into a single AskUserQuestion. Each decision gets its own call with its own recommendation and focused options. Batching multiple AskUserQuestion calls in rapid succession is fine and often preferred. Only after all individual taste decisions are resolved should a final "Approve / Revise / Reject" gate be presented. Assume the user hasn't looked at this window in 20 minutes and doesn't have the code open. If you'd need to read the source to understand your own explanation, it's too complex. @@ -115,54 +115,97 @@ Per-skill instructions may add additional formatting rules on top of this baseli ## Completeness Principle — Boil the Lake -AI makes completeness near-free. Always recommend the complete option over shortcuts — the delta is minutes with CC+gstack. A "lake" (100% coverage, all edge cases) is boilable; an "ocean" (full rewrite, multi-quarter migration) is not. Boil lakes, flag oceans. +AI-assisted coding makes the marginal cost of completeness near-zero. When you present options: -**Effort reference** — always show both scales: +- If Option A is the complete implementation (full parity, all edge cases, 100% coverage) and Option B is a shortcut that saves modest effort — **always recommend A**. The delta between 80 lines and 150 lines is meaningless with CC+gstack. "Good enough" is the wrong instinct when "complete" costs minutes more. +- **Lake vs. ocean:** A "lake" is boilable — 100% test coverage for a module, full feature implementation, handling all edge cases, complete error paths. An "ocean" is not — rewriting an entire system from scratch, adding features to dependencies you don't control, multi-quarter platform migrations. Recommend boiling lakes. Flag oceans as out of scope. +- **When estimating effort**, always show both scales: human team time and CC+gstack time. The compression ratio varies by task type — use this reference: | Task type | Human team | CC+gstack | Compression | |-----------|-----------|-----------|-------------| -| Boilerplate | 2 days | 15 min | ~100x | -| Tests | 1 day | 15 min | ~50x | -| Feature | 1 week | 30 min | ~30x | -| Bug fix | 4 hours | 15 min | ~20x | +| Boilerplate / scaffolding | 2 days | 15 min | ~100x | +| Test writing | 1 day | 15 min | ~50x | +| Feature implementation | 1 week | 30 min | ~30x | +| Bug fix + regression test | 4 hours | 15 min | ~20x | +| Architecture / design | 2 days | 4 hours | ~5x | +| Research / exploration | 1 day | 3 hours | ~3x | -Include `Completeness: X/10` for each option (10=all edge cases, 7=happy path, 3=shortcut). +- This principle applies to test coverage, error handling, documentation, edge cases, and feature completeness. Don't skip the last 10% to "save time" — with AI, that 10% costs seconds. -## Repo Ownership — See Something, Say Something +**Anti-patterns — DON'T do this:** +- BAD: "Choose B — it covers 90% of the value with less code." (If A is only 70 lines more, choose A.) +- BAD: "We can skip edge case handling to save time." (Edge case handling costs minutes with CC.) +- BAD: "Let's defer test coverage to a follow-up PR." (Tests are the cheapest lake to boil.) +- BAD: Quoting only human-team effort: "This would take 2 weeks." (Say: "2 weeks human / ~1 hour CC.") -`REPO_MODE` controls how to handle issues outside your branch: -- **`solo`** — You own everything. Investigate and offer to fix proactively. -- **`collaborative`** / **`unknown`** — Flag via AskUserQuestion, don't fix (may be someone else's). +## Repo Ownership Mode — See Something, Say Something -Always flag anything that looks wrong — one sentence, what you noticed and its impact. +`REPO_MODE` from the preamble tells you who owns issues in this repo: + +- **`solo`** — One person does 80%+ of the work. They own everything. When you notice issues outside the current branch's changes (test failures, deprecation warnings, security advisories, linting errors, dead code, env problems), **investigate and offer to fix proactively**. The solo dev is the only person who will fix it. Default to action. +- **`collaborative`** — Multiple active contributors. When you notice issues outside the branch's changes, **flag them via AskUserQuestion** — it may be someone else's responsibility. Default to asking, not fixing. +- **`unknown`** — Treat as collaborative (safer default — ask before fixing). + +**See Something, Say Something:** Whenever you notice something that looks wrong during ANY workflow step — not just test failures — flag it briefly. One sentence: what you noticed and its impact. In solo mode, follow up with "Want me to fix it?" In collaborative mode, just flag it and move on. + +Never let a noticed issue silently pass. The whole point is proactive communication. ## Search Before Building -Before building anything unfamiliar, **search first.** See `~/.claude/skills/gstack/ETHOS.md`. -- **Layer 1** (tried and true) — don't reinvent. **Layer 2** (new and popular) — scrutinize. **Layer 3** (first principles) — prize above all. +Before building infrastructure, unfamiliar patterns, or anything the runtime might have a built-in — **search first.** Read `~/.claude/skills/gstack/ETHOS.md` for the full philosophy. -**Eureka:** When first-principles reasoning contradicts conventional wisdom, name it and log: +**Three layers of knowledge:** +- **Layer 1** (tried and true — in distribution). Don't reinvent the wheel. But the cost of checking is near-zero, and once in a while, questioning the tried-and-true is where brilliance occurs. +- **Layer 2** (new and popular — search for these). But scrutinize: humans are subject to mania. Search results are inputs to your thinking, not answers. +- **Layer 3** (first principles — prize these above all). Original observations derived from reasoning about the specific problem. The most valuable of all. + +**Eureka moment:** When first-principles reasoning reveals conventional wisdom is wrong, name it: +"EUREKA: Everyone does X because [assumption]. But [evidence] shows this is wrong. Y is better because [reasoning]." + +Log eureka moments: ```bash jq -n --arg ts "$(date -u +%Y-%m-%dT%H:%M:%SZ)" --arg skill "SKILL_NAME" --arg branch "$(git branch --show-current 2>/dev/null)" --arg insight "ONE_LINE_SUMMARY" '{ts:$ts,skill:$skill,branch:$branch,insight:$insight}' >> ~/.gstack/analytics/eureka.jsonl 2>/dev/null || true ``` +Replace SKILL_NAME and ONE_LINE_SUMMARY. Runs inline — don't stop the workflow. + +**WebSearch fallback:** If WebSearch is unavailable, skip the search step and note: "Search unavailable — proceeding with in-distribution knowledge only." ## Contributor Mode -If `_CONTRIB` is `true`: you are in **contributor mode**. At the end of each major workflow step, rate your gstack experience 0-10. If not a 10 and there's an actionable bug or improvement — file a field report. +If `_CONTRIB` is `true`: you are in **contributor mode**. You're a gstack user who also helps make it better. -**File only:** gstack tooling bugs where the input was reasonable but gstack failed. **Skip:** user app bugs, network errors, auth failures on user's site. +**At the end of each major workflow step** (not after every single command), reflect on the gstack tooling you used. Rate your experience 0 to 10. If it wasn't a 10, think about why. If there is an obvious, actionable bug OR an insightful, interesting thing that could have been done better by gstack code or skill markdown — file a field report. Maybe our contributor will help make us better! + +**Calibration — this is the bar:** For example, `$B js "await fetch(...)"` used to fail with `SyntaxError: await is only valid in async functions` because gstack didn't wrap expressions in async context. Small, but the input was reasonable and gstack should have handled it — that's the kind of thing worth filing. Things less consequential than this, ignore. + +**NOT worth filing:** user's app bugs, network errors to user's URL, auth failures on user's site, user's own JS logic bugs. + +**To file:** write `~/.gstack/contributor-logs/{slug}.md` with **all sections below** (do not truncate — include every section through the Date/Version footer): -**To file:** write `~/.gstack/contributor-logs/{slug}.md`: ``` # {Title} -**What I tried:** {action} | **What happened:** {result} | **Rating:** {0-10} -## Repro + +Hey gstack team — ran into this while using /{skill-name}: + +**What I was trying to do:** {what the user/agent was attempting} +**What happened instead:** {what actually happened} +**My rating:** {0-10} — {one sentence on why it wasn't a 10} + +## Steps to reproduce 1. {step} + +## Raw output +``` +{paste the actual error or unexpected output here} +``` + ## What would make this a 10 -{one sentence} -**Date:** {YYYY-MM-DD} | **Version:** {version} | **Skill:** /{skill} +{one sentence: what gstack should have done differently} + +**Date:** {YYYY-MM-DD} | **Version:** {gstack version} | **Skill:** /{skill} ``` -Slug: lowercase hyphens, max 60 chars. Skip if exists. Max 3/session. File inline, don't stop. + +Slug: lowercase, hyphens, max 60 chars (e.g. `browse-js-no-await`). Skip if file already exists. Max 3 reports per session. File inline and continue — don't stop the workflow. Tell user: "Filed gstack field report: {title}" ## Completion Status Protocol @@ -283,6 +326,9 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Pre-landing review finds ASK items that need user judgment - MINOR or MAJOR version bump needed (ask — see Step 4) - Greptile review comments that need user decision (complex fixes, false positives) +- AI-assessed coverage below minimum threshold (hard gate with user override — see Step 3.4) +- Plan items NOT DONE with no user override (see Step 3.45) +- Plan verification failures (see Step 3.47) - TODOS.md missing and user wants to create one (ask — see Step 5.5) - TODOS.md disorganized and user wants to reorganize (ask — see Step 5.5) @@ -294,7 +340,7 @@ You are running the `/ship` workflow. This is a **non-interactive, fully automat - Multi-file changesets (auto-split into bisectable commits) - TODOS.md completed-item detection (auto-mark) - Auto-fixable review findings (dead code, N+1, stale comments — fixed automatically) -- Test coverage gaps (auto-generate and commit, or flag in PR body) +- Test coverage gaps within target threshold (auto-generate and commit, or flag in PR body) --- @@ -986,6 +1032,181 @@ Repo: {owner/repo} --- +## Step 3.45: Plan Completion Audit + +### Plan File Discovery + +1. **Conversation context (primary):** Check if there is an active plan file in this conversation — Claude Code system messages include plan file paths when in plan mode. Look for references like `~/.claude/plans/*.md` in system messages. If found, use it directly — this is the most reliable signal. + +2. **Content-based search (fallback):** If no plan file is referenced in conversation context, search by content: + +```bash +BRANCH=$(git branch --show-current 2>/dev/null | tr '/' '-') +REPO=$(basename "$(git rev-parse --show-toplevel 2>/dev/null)") +# Try branch name match first (most specific) +PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$BRANCH" 2>/dev/null | head -1) +# Fall back to repo name match +[ -z "$PLAN" ] && PLAN=$(ls -t ~/.claude/plans/*.md 2>/dev/null | xargs grep -l "$REPO" 2>/dev/null | head -1) +# Last resort: most recent plan modified in the last 24 hours +[ -z "$PLAN" ] && PLAN=$(find ~/.claude/plans -name '*.md' -mmin -1440 -maxdepth 1 2>/dev/null | xargs ls -t 2>/dev/null | head -1) +[ -n "$PLAN" ] && echo "PLAN_FILE: $PLAN" || echo "NO_PLAN_FILE" +``` + +3. **Validation:** If a plan file was found via content-based search (not conversation context), read the first 20 lines and verify it is relevant to the current branch's work. If it appears to be from a different project or feature, treat as "no plan file found." + +**Error handling:** +- No plan file found → skip with "No plan file detected — skipping." +- Plan file found but unreadable (permissions, encoding) → skip with "Plan file found but unreadable — skipping." + +### Actionable Item Extraction + +Read the plan file. Extract every actionable item — anything that describes work to be done. Look for: + +- **Checkbox items:** `- [ ] ...` or `- [x] ...` +- **Numbered steps** under implementation headings: "1. Create ...", "2. Add ...", "3. Modify ..." +- **Imperative statements:** "Add X to Y", "Create a Z service", "Modify the W controller" +- **File-level specifications:** "New file: path/to/file.ts", "Modify path/to/existing.rb" +- **Test requirements:** "Test that X", "Add test for Y", "Verify Z" +- **Data model changes:** "Add column X to table Y", "Create migration for Z" + +**Ignore:** +- Context/Background sections (`## Context`, `## Background`, `## Problem`) +- Questions and open items (marked with ?, "TBD", "TODO: decide") +- Review report sections (`## GSTACK REVIEW REPORT`) +- Explicitly deferred items ("Future:", "Out of scope:", "NOT in scope:", "P2:", "P3:", "P4:") +- CEO Review Decisions sections (these record choices, not work items) + +**Cap:** Extract at most 50 items. If the plan has more, note: "Showing top 50 of N plan items — full list in plan file." + +**No items found:** If the plan contains no extractable actionable items, skip with: "Plan file contains no actionable items — skipping completion audit." + +For each item, note: +- The item text (verbatim or concise summary) +- Its category: CODE | TEST | MIGRATION | CONFIG | DOCS + +### Cross-Reference Against Diff + +Run `git diff origin/...HEAD` and `git log origin/..HEAD --oneline` to understand what was implemented. + +For each extracted plan item, check the diff and classify: + +- **DONE** — Clear evidence in the diff that this item was implemented. Cite the specific file(s) changed. +- **PARTIAL** — Some work toward this item exists in the diff but it's incomplete (e.g., model created but controller missing, function exists but edge cases not handled). +- **NOT DONE** — No evidence in the diff that this item was addressed. +- **CHANGED** — The item was implemented using a different approach than the plan described, but the same goal is achieved. Note the difference. + +**Be conservative with DONE** — require clear evidence in the diff. A file being touched is not enough; the specific functionality described must be present. +**Be generous with CHANGED** — if the goal is met by different means, that counts as addressed. + +### Output Format + +``` +PLAN COMPLETION AUDIT +═══════════════════════════════ +Plan: {plan file path} + +## Implementation Items + [DONE] Create UserService — src/services/user_service.rb (+142 lines) + [PARTIAL] Add validation — model validates but missing controller checks + [NOT DONE] Add caching layer — no cache-related changes in diff + [CHANGED] "Redis queue" → implemented with Sidekiq instead + +## Test Items + [DONE] Unit tests for UserService — test/services/user_service_test.rb + [NOT DONE] E2E test for signup flow + +## Migration Items + [DONE] Create users table — db/migrate/20240315_create_users.rb + +───────────────────────────────── +COMPLETION: 4/7 DONE, 1 PARTIAL, 1 NOT DONE, 1 CHANGED +───────────────────────────────── +``` + +### Gate Logic + +After producing the completion checklist: + +- **All DONE or CHANGED:** Pass. "Plan completion: PASS — all items addressed." Continue. +- **Only PARTIAL items (no NOT DONE):** Continue with a note in the PR body. Not blocking. +- **Any NOT DONE items:** Use AskUserQuestion: + - Show the completion checklist above + - "{N} items from the plan are NOT DONE. These were part of the original plan but are missing from the implementation." + - RECOMMENDATION: depends on item count and severity. If 1-2 minor items (docs, config), recommend B. If core functionality is missing, recommend A. + - Options: + A) Stop — implement the missing items before shipping + B) Ship anyway — defer these to a follow-up (will create P1 TODOs in Step 5.5) + C) These items were intentionally dropped — remove from scope + - If A: STOP. List the missing items for the user to implement. + - If B: Continue. For each NOT DONE item, create a P1 TODO in Step 5.5 with "Deferred from plan: {plan file path}". + - If C: Continue. Note in PR body: "Plan items intentionally dropped: {list}." + +**No plan file found:** Skip entirely. "No plan file detected — skipping plan completion audit." + +**Include in PR body (Step 8):** Add a `## Plan Completion` section with the checklist summary. + +--- + +## Step 3.47: Plan Verification + +Automatically verify the plan's testing/verification steps using the `/qa-only` skill. + +### 1. Check for verification section + +Using the plan file already discovered in Step 3.45, look for a verification section. Match any of these headings: `## Verification`, `## Test plan`, `## Testing`, `## How to test`, `## Manual testing`, or any section with verification-flavored items (URLs to visit, things to check visually, interactions to test). + +**If no verification section found:** Skip with "No verification steps found in plan — skipping auto-verification." +**If no plan file was found in Step 3.45:** Skip (already handled). + +### 2. Check for running dev server + +Before invoking browse-based verification, check if a dev server is reachable: + +```bash +curl -s -o /dev/null -w '%{http_code}' http://localhost:3000 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:8080 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:5173 2>/dev/null || \ +curl -s -o /dev/null -w '%{http_code}' http://localhost:4000 2>/dev/null || echo "NO_SERVER" +``` + +**If NO_SERVER:** Skip with "No dev server detected — skipping plan verification. Run /qa separately after deploying." + +### 3. Invoke /qa-only inline + +Read the `/qa-only` skill from disk: + +```bash +cat ${CLAUDE_SKILL_DIR}/../qa-only/SKILL.md +``` + +**If unreadable:** Skip with "Could not load /qa-only — skipping plan verification." + +Follow the /qa-only workflow with these modifications: +- **Skip the preamble** (already handled by /ship) +- **Use the plan's verification section as the primary test input** — treat each verification item as a test case +- **Use the detected dev server URL** as the base URL +- **Skip the fix loop** — this is report-only verification during /ship +- **Cap at the verification items from the plan** — do not expand into general site QA + +### 4. Gate logic + +- **All verification items PASS:** Continue silently. "Plan verification: PASS." +- **Any FAIL:** Use AskUserQuestion: + - Show the failures with screenshot evidence + - RECOMMENDATION: Choose A if failures indicate broken functionality. Choose B if cosmetic only. + - Options: + A) Fix the failures before shipping (recommended for functional issues) + B) Ship anyway — known issues (acceptable for cosmetic issues) +- **No verification section / no server / unreadable skill:** Skip (non-blocking). + +### 5. Include in PR body + +Add a `## Verification Results` section to the PR body (Step 8): +- If verification ran: summary of results (N PASS, M FAIL, K SKIPPED) +- If skipped: reason for skipping (no plan, no server, no verification section) + +--- + ## Step 3.5: Pre-Landing Review Review the diff for structural issues that tests don't catch. @@ -1448,6 +1669,16 @@ gh pr create --base --title ": " --body "$(cat <<'EOF' +## Plan Completion + + + + +## Verification Results + + + + ## TODOS @@ -1488,6 +1719,32 @@ doc updates — the user runs `/ship` and documentation stays current without a --- +## Step 8.75: Persist ship metrics + +Log coverage and plan completion data so `/retro` can track trends: + +```bash +eval "$(~/.claude/skills/gstack/bin/gstack-slug 2>/dev/null)" && mkdir -p ~/.gstack/projects/$SLUG +``` + +Append to `~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl`: + +```bash +echo '{"skill":"ship","timestamp":"'"$(date -u +%Y-%m-%dT%H:%M:%SZ)"'","coverage_pct":COVERAGE_PCT,"plan_items_total":PLAN_TOTAL,"plan_items_done":PLAN_DONE,"verification_result":"VERIFY_RESULT","version":"VERSION","branch":"BRANCH"}' >> ~/.gstack/projects/$SLUG/$BRANCH-reviews.jsonl +``` + +Substitute from earlier steps: +- **COVERAGE_PCT**: coverage percentage from Step 3.4 diagram (integer, or -1 if undetermined) +- **PLAN_TOTAL**: total plan items extracted in Step 3.45 (0 if no plan file) +- **PLAN_DONE**: count of DONE + CHANGED items from Step 3.45 (0 if no plan file) +- **VERIFY_RESULT**: "pass", "fail", or "skipped" from Step 3.47 +- **VERSION**: from the VERSION file +- **BRANCH**: current branch name + +This step is automatic — never skip it, never ask for confirmation. + +--- + ## Important Rules - **Never skip tests.** If tests fail, stop. From 7b13a7f9a16a85d83f18049dc27cc069b1f02065 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:32:33 -0700 Subject: [PATCH 4/5] chore: bump version and changelog (v0.11.19.0) Co-Authored-By: Claude Opus 4.6 (1M context) --- CHANGELOG.md | 43 +++++++++++++++++++++++++++++++++++++++++++ VERSION | 2 +- package.json | 2 +- 3 files changed, 45 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 437a7eedd..e63678647 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,5 +1,48 @@ # Changelog +## [0.11.19.0] - 2026-03-24 + +### Fixed + +- **Auto-upgrade no longer breaks.** The root gstack skill description was 7 characters from the Codex 1024-char limit. Every new skill addition pushed it closer. Moved the skill routing table from the description (bounded) to the body (unlimited), dropping from 1017 to 409 chars with 615 chars of headroom. +- **Codex reviews now run in the correct repo.** In multi-workspace setups (like Conductor), Codex could pick up the wrong project directory. All `codex exec` calls now explicitly set `-C` to the git root. + +### Added + +- **900-char early warning test.** A new test fails if any Codex skill description exceeds 900 chars, catching description bloat before it breaks builds. + +## [0.11.18.1] - 2026-03-24 + +### Changed + +- **One decision per question — everywhere.** Every skill now presents decisions one at a time, each with its own focused question, recommendation, and options. No more wall-of-text questions that bundle unrelated choices together. This was already enforced in the three plan-review skills; now it's a universal rule across all 23+ skills. + +## [0.11.18.0] - 2026-03-24 — Ship With Teeth + +`/ship` and `/review` now actually enforce the quality gates they've been talking about. Coverage audit becomes a real gate (not just a diagram), plan completion gets verified against the diff, and verification steps from your plan run automatically. + +### Added + +- **Test coverage gate in /ship.** AI-assessed coverage below 60% is a hard stop. 60-79% gets a prompt. 80%+ passes. Thresholds are configurable per-project via `## Test Coverage` in CLAUDE.md. +- **Coverage warning in /review.** Low coverage is now flagged prominently before you reach the /ship gate, so you can write tests early. +- **Plan completion audit.** /ship reads your plan file, extracts every actionable item, cross-references against the diff, and shows you a DONE/NOT DONE/PARTIAL/CHANGED checklist. Missing items are a shipping blocker (with override). +- **Plan-aware scope drift detection.** /review's scope drift check now reads the plan file too — not just TODOS.md and PR description. +- **Auto-verification via /qa-only.** /ship reads your plan's verification section and runs /qa-only inline to test it — if a dev server is running on localhost. No server, no problem — it skips gracefully. +- **Shared plan file discovery.** Conversation context first, content-based grep fallback second. Used by plan completion, plan review reports, and verification. +- **Ship metrics logging.** Coverage %, plan completion ratio, and verification results are logged to review JSONL for /retro to track trends. +- **Plan completion in /retro.** Weekly retros now show plan completion rates across shipped branches. + +## [0.11.17.0] - 2026-03-24 — Cleaner Skill Descriptions + Proactive Opt-Out + +### Changed + +- **Skill descriptions are now clean and readable.** Removed the ugly "MANUAL TRIGGER ONLY" prefix from every skill description that was wasting 58 characters and causing build errors for Codex integration. +- **You can now opt out of proactive skill suggestions.** The first time you run any gstack skill, you'll be asked whether you want gstack to suggest skills during your workflow. If you prefer to invoke skills manually, just say no — it's saved as a global setting. You can change your mind anytime with `gstack-config set proactive true/false`. + +### Fixed + +- **Telemetry source tagging no longer crashes.** Fixed duration guards and source field validation in the telemetry logger so it handles edge cases cleanly instead of erroring. + ## [0.11.16.1] - 2026-03-24 — Installation ID Privacy Fix ### Fixed diff --git a/VERSION b/VERSION index f71aefdff..d20322e5c 100644 --- a/VERSION +++ b/VERSION @@ -1 +1 @@ -0.11.16.1 +0.11.19.0 diff --git a/package.json b/package.json index cbf826161..f666c9af9 100644 --- a/package.json +++ b/package.json @@ -1,6 +1,6 @@ { "name": "gstack", - "version": "0.11.16.1", + "version": "0.11.19.0", "description": "Garry's Stack — Claude Code skills + fast headless browser. One repo, one install, entire AI engineering workflow.", "license": "MIT", "type": "module", From e8ae6e32b48cace048efbc515aa6b08dac711967 Mon Sep 17 00:00:00 2001 From: Garry Tan Date: Tue, 24 Mar 2026 20:49:18 -0700 Subject: [PATCH 5/5] fix: Codex wrong-repo + routing table to body + 900-char guard (v0.11.19.0) - Add -C "$(git rev-parse --show-toplevel)" to all 14 codex exec calls so Codex always runs in the correct repo (fixes Conductor multi-workspace bug) - Move skill routing table from description to body in SKILL.md.tmpl (description was already shortened on main; routing table was missing from body) - Add 900-char warning threshold test for Codex descriptions - Bump version + sync package.json Co-Authored-By: Claude Opus 4.6 (1M context) --- SKILL.md | 22 ++++++++++++++++++++++ SKILL.md.tmpl | 22 ++++++++++++++++++++++ autoplan/SKILL.md | 6 +++--- autoplan/SKILL.md.tmpl | 6 +++--- codex/SKILL.md | 6 +++--- codex/SKILL.md.tmpl | 6 +++--- scripts/resolvers/design.ts | 6 +++--- scripts/resolvers/review.ts | 6 +++--- test/gen-skill-docs.test.ts | 18 ++++++++++++++++++ 9 files changed, 80 insertions(+), 18 deletions(-) diff --git a/SKILL.md b/SKILL.md index dada1e75d..f6d2831e5 100644 --- a/SKILL.md +++ b/SKILL.md @@ -297,6 +297,28 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. +If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the +user's workflow stage: +- Brainstorming → /office-hours +- Strategy → /plan-ceo-review +- Architecture → /plan-eng-review +- Design → /plan-design-review or /design-consultation +- Auto-review → /autoplan +- Debugging → /investigate +- QA → /qa +- Code review → /review +- Visual audit → /design-review +- Shipping → /ship +- Docs → /document-release +- Retro → /retro +- Second opinion → /codex +- Prod safety → /careful or /guard +- Scoped edits → /freeze or /unfreeze +- Upgrades → /gstack-upgrade + +If the user opts out of suggestions, run `gstack-config set proactive false`. +If they opt back in, run `gstack-config set proactive true`. + # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/SKILL.md.tmpl b/SKILL.md.tmpl index fca8fa605..31bd28375 100644 --- a/SKILL.md.tmpl +++ b/SKILL.md.tmpl @@ -20,6 +20,28 @@ If `PROACTIVE` is `false`: do NOT proactively suggest other gstack skills during Only run skills the user explicitly invokes. This preference persists across sessions via `gstack-config`. +If `PROACTIVE` is `true` (default): suggest adjacent gstack skills when relevant to the +user's workflow stage: +- Brainstorming → /office-hours +- Strategy → /plan-ceo-review +- Architecture → /plan-eng-review +- Design → /plan-design-review or /design-consultation +- Auto-review → /autoplan +- Debugging → /investigate +- QA → /qa +- Code review → /review +- Visual audit → /design-review +- Shipping → /ship +- Docs → /document-release +- Retro → /retro +- Second opinion → /codex +- Prod safety → /careful or /guard +- Scoped edits → /freeze or /unfreeze +- Upgrades → /gstack-upgrade + +If the user opts out of suggestions, run `gstack-config set proactive false`. +If they opt back in, run `gstack-config set proactive true`. + # gstack browse: QA Testing & Dogfooding Persistent headless Chromium. First call auto-starts (~3s), then ~100-200ms per command. diff --git a/autoplan/SKILL.md b/autoplan/SKILL.md index e9161eabe..148749006 100644 --- a/autoplan/SKILL.md +++ b/autoplan/SKILL.md @@ -547,7 +547,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -658,7 +658,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." -s read-only --enable web_search_cached` + Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -723,7 +723,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. CEO: Design: - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/autoplan/SKILL.md.tmpl b/autoplan/SKILL.md.tmpl index b3e0a3402..661e8fb01 100644 --- a/autoplan/SKILL.md.tmpl +++ b/autoplan/SKILL.md.tmpl @@ -204,7 +204,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. What alternatives were dismissed too quickly? What competitive or market risks are unaddressed? What scope decisions will look foolish in 6 months? Be adversarial. No compliments. Just the strategic blind spots. - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude CEO subagent** (via Agent tool): @@ -315,7 +315,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. accessibility requirements (keyboard nav, contrast, touch targets) specified or aspirational? Does the plan describe specific UI decisions or generic patterns? What design decisions will haunt the implementer if left ambiguous? - Be opinionated. No hedging." -s read-only --enable web_search_cached` + Be opinionated. No hedging." -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude design subagent** (via Agent tool): @@ -380,7 +380,7 @@ Override: every AskUserQuestion → auto-decide using the 6 principles. CEO: Design: - File: " -s read-only --enable web_search_cached` + File: " -C "$(git rev-parse --show-toplevel)" -s read-only --enable web_search_cached` Timeout: 10 minutes **Claude eng subagent** (via Agent tool): diff --git a/codex/SKILL.md b/codex/SKILL.md index f34b8db4c..8bce22e54 100644 --- a/codex/SKILL.md +++ b/codex/SKILL.md @@ -518,7 +518,7 @@ With focus (e.g., "security"): 2. Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -603,7 +603,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -636,7 +636,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/codex/SKILL.md.tmpl b/codex/SKILL.md.tmpl index c0b7adb1b..338df93b6 100644 --- a/codex/SKILL.md.tmpl +++ b/codex/SKILL.md.tmpl @@ -159,7 +159,7 @@ With focus (e.g., "security"): 2. Run codex exec with **JSONL output** to capture reasoning traces and tool calls (5-minute timeout): ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>/dev/null | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -244,7 +244,7 @@ THE PLAN: For a **new session:** ```bash -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " import sys, json for line in sys.stdin: line = line.strip() @@ -277,7 +277,7 @@ for line in sys.stdin: For a **resumed session** (user chose "Continue"): ```bash -codex exec resume "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " +codex exec resume "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached --json 2>"$TMPERR" | python3 -c " " ``` diff --git a/scripts/resolvers/design.ts b/scripts/resolvers/design.ts index 30b1fe2c8..c4926112a 100644 --- a/scripts/resolvers/design.ts +++ b/scripts/resolvers/design.ts @@ -17,7 +17,7 @@ If Codex is available, run a lightweight design check on the diff: \`\`\`bash TMPERR_DRL=$(mktemp /tmp/codex-drl-XXXXXXXX) -codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" +codex exec "Review the git diff on this branch. Run 7 litmus checks (YES/NO each): ${litmusList} Flag any hard rejections: ${rejectionList} 5 most important design findings only. Reference file:line." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="high"' --enable web_search_cached 2>"$TMPERR_DRL" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -467,7 +467,7 @@ If user chooses A, launch both voices simultaneously: 1. **Codex** (via Bash, \`model_reasoning_effort="medium"\`): \`\`\`bash TMPERR_SKETCH=$(mktemp /tmp/codex-sketch-XXXXXXXX) -codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" +codex exec "For this product approach, provide: a visual thesis (one sentence — mood, material, energy), a content plan (hero → support → detail → CTA), and 2 interaction ideas that change page feel. Apply beautiful defaults: composition-first, brand-first, cardless, poster not document. Be opinionated." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="medium"' --enable web_search_cached 2>"$TMPERR_SKETCH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After completion: \`cat "$TMPERR_SKETCH" && rm -f "$TMPERR_SKETCH"\` @@ -636,7 +636,7 @@ which codex 2>/dev/null && echo "CODEX_AVAILABLE" || echo "CODEX_NOT_AVAILABLE" 1. **Codex design voice** (via Bash): \`\`\`bash TMPERR_DESIGN=$(mktemp /tmp/codex-design-XXXXXXXX) -codex exec "${escapedCodexPrompt}" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" +codex exec "${escapedCodexPrompt}" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="${reasoningEffort}"' --enable web_search_cached 2>"$TMPERR_DESIGN" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: \`\`\`bash diff --git a/scripts/resolvers/review.ts b/scripts/resolvers/review.ts index 1831e098b..2b83f36d4 100644 --- a/scripts/resolvers/review.ts +++ b/scripts/resolvers/review.ts @@ -286,7 +286,7 @@ Write the full prompt (context block + instructions) to this file. Use the mode- \`\`\`bash TMPERR_OH=$(mktemp /tmp/codex-oh-err-XXXXXXXX) -codex exec "$(cat "$CODEX_PROMPT_FILE")" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" +codex exec "$(cat "$CODEX_PROMPT_FILE")" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_OH" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: @@ -370,7 +370,7 @@ Claude's structured review already ran. Now add a **cross-model adversarial chal \`\`\`bash TMPERR_ADV=$(mktemp /tmp/codex-adv-XXXXXXXX) -codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" +codex exec "Review the changes on this branch against the base branch. Run git diff origin/ to see the diff. Your job is to find ways this code will fail in production. Think like an attacker and a chaos engineer. Find edge cases, race conditions, security holes, resource leaks, failure modes, and silent data corruption paths. Be adversarial. Be thorough. No compliments — just the problems." -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_ADV" \`\`\` Set the Bash tool's \`timeout\` parameter to \`300000\` (5 minutes). Do NOT use the \`timeout\` shell command — it doesn't exist on macOS. After the command completes, read stderr: @@ -525,7 +525,7 @@ THE PLAN: \`\`\`bash TMPERR_PV=$(mktemp /tmp/codex-planreview-XXXXXXXX) -codex exec "" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" +codex exec "" -C "$(git rev-parse --show-toplevel)" -s read-only -c 'model_reasoning_effort="xhigh"' --enable web_search_cached 2>"$TMPERR_PV" \`\`\` Use a 5-minute timeout (\`timeout: 300000\`). After the command completes, read stderr: diff --git a/test/gen-skill-docs.test.ts b/test/gen-skill-docs.test.ts index d8a071a1f..c26bb64b5 100644 --- a/test/gen-skill-docs.test.ts +++ b/test/gen-skill-docs.test.ts @@ -152,6 +152,24 @@ describe('gen-skill-docs', () => { } }); + test('every Codex SKILL.md description stays under 900-char warning threshold', () => { + const WARN_THRESHOLD = 900; + const agentsDir = path.join(ROOT, '.agents', 'skills'); + if (!fs.existsSync(agentsDir)) return; + const violations: string[] = []; + for (const entry of fs.readdirSync(agentsDir, { withFileTypes: true })) { + if (!entry.isDirectory()) continue; + const skillMd = path.join(agentsDir, entry.name, 'SKILL.md'); + if (!fs.existsSync(skillMd)) continue; + const content = fs.readFileSync(skillMd, 'utf-8'); + const description = extractDescription(content); + if (description.length > WARN_THRESHOLD) { + violations.push(`${entry.name}: ${description.length} chars (limit ${MAX_SKILL_DESCRIPTION_LENGTH}, ${MAX_SKILL_DESCRIPTION_LENGTH - description.length} remaining)`); + } + } + expect(violations).toEqual([]); + }); + test('package.json version matches VERSION file', () => { const pkg = JSON.parse(fs.readFileSync(path.join(ROOT, 'package.json'), 'utf-8')); const version = fs.readFileSync(path.join(ROOT, 'VERSION'), 'utf-8').trim();