From 191afcc7b9a636702992c2b9e7e3f4baf760d3ab Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Fri, 20 Mar 2026 19:56:08 +0200
Subject: [PATCH] feat: enrichment backfill script (Flash-Lite) + CLAUDE.md
 compact instructions

- enrichment_backfill.py: batch tag enrichment via Gemini 2.5 Flash-Lite
  (Flash silently ignores thinking_budget:0; Flash-Lite has no thinking mode at all)
- Supports --test (10 chunks dry run) and --limit N
- Auto-detects unfaceted chunks, newest first
- Merges faceted tags (dom:*, act:*, topics) with existing tags
- CLAUDE.md: added Compact Instructions + BrainBar stub warnings

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
---
 CLAUDE.md                      |  35 +++++++
 scripts/enrichment_backfill.py | 177 +++++++++++++++++++++++++++++++++
 2 files changed, 212 insertions(+)
 create mode 100644 scripts/enrichment_backfill.py

diff --git a/CLAUDE.md b/CLAUDE.md
index 46ab387e..e94ebfbd 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -7,6 +7,41 @@
 - Build a local, private knowledge base from Claude Code sessions.
 - Provide fast search, context retrieval, and exports for downstream tools.
 
+---
+
+## BrainBar Stub Warnings
+
+BrainBar Swift daemon has 4 STUB tools returning fake success:
+- brain_digest, brain_update, brain_expand, brain_tags — ALL BROKEN
+- Working: brain_search, brain_store, brain_recall, brain_entity
+- Last successful digest: March 14, 2026
+
+## Compact Instructions
+
+When compacting this session, follow these rules strictly:
+
+### NEVER preserve
+- /loop, QUEUE-OPERATION, cron polling (3+ identical system/cron messages = keep ZERO)
+- BrainLayer search injections (re-injected fresh each turn)
+- Full file contents re-readable from disk (keep path + one-line summary of decision made)
+
+### ALWAYS preserve verbatim
+- User vision/goal/decision statements (if stated 3x+, note "[USER STATED Nx]")
+- User repetitions in DIFFERENT places = importance signal, keep ONE with annotation
+- Short user messages (approvals, frustration signals) — these carry intent
+- Sprint plan with priority ratings
+- All decisions with rationale (WHY not just WHAT)
+- Modified file paths with one-line change summary
+
+### Structure summary as
+1. **Session Intent**: What the user wants (exact quotes)
+2. **Decisions Made**: Each + rationale + who
+3. **Artifact Trail**: Files, tests, commands
+4. **Current State**: Working/broken/in-progress
+5. **Next Steps**: Ordered by sprint plan priority
+
+---
+
 ## Stack (WHAT)
 - Python package + Typer CLI in `src/brainlayer/`
 - sqlite-vec storage via APSW (`vector_store.py`)
diff --git a/scripts/enrichment_backfill.py b/scripts/enrichment_backfill.py
new file mode 100644
index 00000000..e26536a7
--- /dev/null
+++ b/scripts/enrichment_backfill.py
@@ -0,0 +1,177 @@
+"""Backfill enrichment: run faceted tag prompt on chunks missing faceted tags.
+
+Usage:
+  GOOGLE_API_KEY=... python3 scripts/enrichment_backfill.py [--limit N] [--test]
+
+  --test: run on 10 chunks, print results, don't commit to DB
+  --limit N: process N chunks (default: all unfaceted)
+"""
+
+import json
+import os
+import sys
+import time
+import sqlite3
+import argparse
+from pathlib import Path
+
+sys.path.insert(0, str(Path(__file__).parent.parent))  # put the repo root (parent of scripts/) on the import path
+
+API_KEY = os.environ.get("GOOGLE_API_KEY")
+if not API_KEY:  # fail fast at import time, before the Gemini client is created
+    print("ERROR: GOOGLE_API_KEY required")
+    sys.exit(1)
+
+from google import genai  # imported after the key check, so a missing key exits before this import runs
+
+client = genai.Client(api_key=API_KEY)  # module-level client used by enrich_chunk()
+MODEL = "gemini-2.5-flash-lite"  # No thinking by design. Fast, cheap, good for classification.
+DB_PATH = str(Path.home() / ".local/share/brainlayer/brainlayer.db")  # SQLite file opened by main()
+
+PROMPT = """You are a knowledge base tagger for a personal multi-project development knowledge base. Your job is to identify WHAT SPECIFIC THING each chunk is about — not the kind of work being done.
+
+## Critical distinction
+
+GOOD tags describe the SUBJECT: "brainlayer-search-quality", "6pm-confirmation-flow", "importance-calibration"
+BAD tags describe the FORMAT: "typescript", "debugging", "code-review", "feature-dev"
+
+Ask yourself: "If someone searches for this topic in 6 months, what words would they use?" Tag with THOSE words.
+
+## Output schema (JSON)
+
+Return a JSON object with these fields in this exact order:
+
+- **a_reasoning** (string): 1-2 sentences explaining what specific subject this chunk discusses.
+- **b_topics** (string[]): 1-3 object tags — specific, hyphenated, 2-4 words.
+- **c_activity** (string): Exactly one of: act:debugging, act:implementing, act:designing, act:reviewing, act:researching, act:planning, act:configuring, act:refactoring, act:testing, act:learning
+- **d_domain** (string[]): 0-3 technology domains from: dom:typescript, dom:python, dom:swift, dom:sql, dom:react, dom:convex, dom:supabase, dom:mcp, dom:vertex-ai, dom:ollama, dom:mlx, dom:git, dom:telegram, dom:whatsapp, dom:macos, dom:cli, dom:css, dom:html, dom:docker, dom:railway, dom:linear, dom:obsidian. Empty array if no specific technology.
+- **e_confidence** (number): 0.0-1.0 confidence in your tagging. Below 0.5 = low-content chunk.
+
+## Now tag this chunk:
+
+{chunk_content}"""  # {chunk_content} is filled in with str.replace() by enrich_chunk(), not str.format()
+
+
+def get_unfaceted_chunks(db, limit=None):
+    """Get chunks that don't have faceted tags yet, newest first (limit=None: all rows, 0: no rows)."""
+    sql = """
+        SELECT rowid, id, content, source, tags
+        FROM chunks
+        WHERE (tags NOT LIKE '%dom:%' AND tags NOT LIKE '%act:%') OR tags IS NULL
+        ORDER BY rowid DESC
+        LIMIT ?
+    """
+    # Bind LIMIT instead of formatting it into the SQL; SQLite reads a negative LIMIT as "no limit".
+    return db.execute(sql, (-1 if limit is None else limit,)).fetchall()
+
+
+def enrich_chunk(content):
+    """Ask Gemini for faceted tags; return the parsed JSON object (dict), or None on any failure."""
+    try:
+        response = client.models.generate_content(
+            model=MODEL,
+            contents=PROMPT.replace("{chunk_content}", content[:2000]),  # only the first 2000 chars are sent
+            config={"response_mime_type": "application/json"},
+        )
+        parsed = json.loads(response.text or "")  # a missing text body is reported as bad JSON
+    except json.JSONDecodeError as e:
+        print(f"  Bad JSON from model: {e}")
+        return None
+    except Exception as e:  # best-effort: one failed call must not abort the whole backfill
+        print(f"  API error: {e}")
+        return None
+    return parsed if isinstance(parsed, dict) else None  # a JSON list/number/string is not a usable answer
+
+
+def merge_tags(existing_tags, new_result):
+    """Merge new faceted tags into the existing tags and return a JSON array string.
+
+    existing_tags is a JSON list string, any other string (kept as one tag), or None/"".
+    Old dom:*/act:* tags are replaced; order is stable (existing first), duplicates dropped.
+    """
+    try:
+        existing = json.loads(existing_tags) if existing_tags else []
+    except (json.JSONDecodeError, TypeError):
+        existing = [existing_tags]
+    if not isinstance(existing, list):
+        existing = [existing_tags]  # valid JSON but not a list: keep the raw value as one tag
+
+    def as_list(value):  # the model may return null or a bare string where a list is expected
+        return value if isinstance(value, list) else ([value] if value else [])
+
+    fresh = [t for key in ("b_topics", "c_activity", "d_domain") for t in as_list(new_result.get(key))]
+    kept = [t for t in existing if isinstance(t, str) and not t.startswith(("dom:", "act:"))]
+    return json.dumps(list(dict.fromkeys(t for t in kept + fresh if isinstance(t, str) and t)))
+
+
+def main():
+    """Enrich unfaceted chunks via Gemini; --test is a 10-chunk dry run that never writes to the DB."""
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--test", action="store_true", help="Test on 10 chunks, don't write to DB")
+    parser.add_argument("--limit", type=int, default=None, help="Max chunks to process")
+    args = parser.parse_args()
+
+    if args.test:
+        args.limit = 10
+
+    db = sqlite3.connect(DB_PATH, timeout=30)
+    try:
+        db.execute("PRAGMA journal_mode=WAL")
+        db.execute("PRAGMA busy_timeout=10000")
+
+        chunks = get_unfaceted_chunks(db, args.limit)
+        total = len(chunks)
+        print(f"Chunks to process: {total}", flush=True)
+        if total == 0:
+            print("Nothing to enrich!")
+            return
+
+        done = 0
+        errors = 0
+        t0 = time.time()
+        for rowid, chunk_id, content, source, existing_tags in chunks:
+            result = enrich_chunk(content)
+            preview = content or ""  # a NULL content column must not crash the dry-run output
+            # Only a JSON object carrying b_topics counts as a usable model answer.
+            if isinstance(result, dict) and "b_topics" in result:
+                merged = merge_tags(existing_tags, result)
+                confidence = result.get("e_confidence", 0)
+                if not args.test:
+                    db.execute(
+                        "UPDATE chunks SET tags = ?, tag_confidence = ? WHERE rowid = ?",
+                        (merged, confidence, rowid),
+                    )
+
+                done += 1
+                if done % 50 == 0:  # checkpoint and progress line every 50 successes
+                    if not args.test:
+                        db.commit()
+                    elapsed = time.time() - t0
+                    rate = done / elapsed
+                    eta = (total - done) / rate / 60 if rate > 0 else 0
+                    print(f"  [{done}/{total}] {rate:.1f}/sec, {eta:.0f}min left", flush=True)
+
+                if args.test:
+                    print(f"\n  Chunk {done}: {preview[:80]}...")
+                    print(f"  Topics: {result.get('b_topics', [])}")
+                    print(f"  Activity: {result.get('c_activity', '')}")
+                    print(f"  Domain: {result.get('d_domain', [])}")
+                    print(f"  Confidence: {result.get('e_confidence', 0)}")
+            else:
+                errors += 1
+                if args.test:
+                    print(f"\n  Chunk FAILED: {preview[:60]}... → {result}")
+
+            # Flash-Lite: 30 RPM free tier, 2000 RPM paid. Minimal sleep (0.1 s allows up to ~600 req/min).
+            time.sleep(0.1)
+
+        elapsed = time.time() - t0
+        print(f"\nDone: {done}/{total} enriched, {errors} errors, {elapsed:.0f}s ({done/elapsed:.1f}/sec)", flush=True)
+    finally:
+        if not args.test:
+            db.commit()  # keep every finished update even after Ctrl-C or a crash mid-run
+        db.close()
+
+
+if __name__ == "__main__":  # call main() only when the file is executed as a script
+    main()