From a7f1a8016b208d35beec56a7b94454923a33c21a Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Mon, 9 Mar 2026 13:18:32 +0200
Subject: [PATCH 1/3] feat: eval suite + entity injection in prompt hook (Phase
 0 + Phase A)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Phase 0 — Baselines:
- tests/test_eval_baselines.py: 23-case eval suite across 8 domains
  (entity routing, tag filter, recency, Hebrew FTS, cross-project,
   decision retrieval, memory, mined real queries)
- tests/eval_baselines.json: recorded baseline results
- scripts/run_evals.py: CLI runner for before/after comparison
- tests/conftest.py: register `live` pytest mark

Phase A — Entity routing in prompt hook:
- hooks/brainlayer-prompt-search.py: detect known entity names
  (person, company, agent) in user prompt → inject [Entity: Name — type]
  section + linked chunks before FTS results
- Possessive stripping ("Simon's" → "Simon") for bigram matching
- Filter: only person/company/agent types (skip technology/concept noise)

Before/After scores (run: python tests/test_eval_baselines.py):
- brain_search quality: 94.7% (18/19) — unchanged (already good)
- hook entity injection: 25% → 100% (3 tests now pass)
- combined: 82.6% → 95.7% (+13.1pp)

Known gaps (xfail):
- Hebrew semantic accuracy (query returns unrelated Hebrew content)
- "today" temporal awareness in raw hybrid_search

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 hooks/brainlayer-prompt-search.py | 311 +++++++++++++
 scripts/run_evals.py              |  95 ++++
 tests/conftest.py                 |   8 +
 tests/eval_baselines.json         | 361 +++++++++++++++
 tests/test_eval_baselines.py      | 703 ++++++++++++++++++++++++++++++
 5 files changed, 1478 insertions(+)
 create mode 100755 hooks/brainlayer-prompt-search.py
 create mode 100644 scripts/run_evals.py
 create mode 100644 tests/eval_baselines.json
 create mode 100644 tests/test_eval_baselines.py

diff --git a/hooks/brainlayer-prompt-search.py b/hooks/brainlayer-prompt-search.py
new file mode 100755
index 00000000..74f16bef
--- /dev/null
+++ b/hooks/brainlayer-prompt-search.py
@@ -0,0 +1,311 @@
+#!/usr/bin/env python3
+"""
+BrainLayer UserPromptSubmit Hook — auto-searches memories relevant to the user's prompt.
+
+Extracts keywords from the prompt, injects matched entity context, runs FTS5 search against BrainLayer.
+Two modes:
+  - Light (default): top 3 results, ~300 tokens
+  - Deep (triggered by memory words): top 8 results, ~800 tokens
+
+Output: plain text to stdout (injected as Claude context).
+Target: <500ms total.
+"""
+
+import json
+import os
+import re
+import sqlite3
+import sys
+import time
+
+DEADLINE_MS = 450
+
+# Prompts shorter than this are probably greetings/commands — skip search
+MIN_PROMPT_LENGTH = 15
+
+# Trigger words that activate deep mode (more results)
+DEEP_TRIGGERS = {
+    "remember",
+    "last time",
+    "previous",
+    "previously",
+    "before",
+    "history",
+    "earlier",
+    "we discussed",
+    "we decided",
+    "we talked",
+    "recall",
+    "forgot",
+    "what was",
+    "what were",
+    "when did",
+    "how did",
+    "brainlayer",
+}
+
+# Common English stop words to skip during keyword extraction
+STOP_WORDS = {
+    "a", "an", "the", "is", "it", "in", "on", "at", "to", "for", "of",
+    "and", "or", "but", "not", "with", "this", "that", "from", "by",
+    "are", "was", "were", "be", "been", "being", "have", "has", "had",
+    "do", "does", "did", "will", "would", "could", "should", "may",
+    "might", "can", "shall", "must", "need", "let", "me", "my", "i",
+    "you", "your", "we", "our", "they", "them", "their", "he", "she",
+    "his", "her", "its", "if", "then", "else", "when", "where", "how",
+    "what", "which", "who", "why", "so", "just", "also", "very", "too",
+    "up", "out", "about", "into", "over", "after", "some", "any", "all",
+    "no", "yes", "ok", "okay", "please", "thanks", "thank", "hey",
+    "hi", "hello", "sure", "right", "well", "now", "here", "there",
+    "like", "want", "think", "know", "see", "look", "make", "take",
+    "get", "go", "come", "use", "try", "help", "tell", "give", "show",
+    "work", "call", "run", "set", "add", "put", "keep", "find", "read",
+    "write", "create", "build", "check", "start", "stop", "change",
+    "move", "open", "close", "new", "old", "good", "bad", "big",
+    "small", "first", "last", "next", "more", "less", "much", "many",
+    "each", "every", "other", "same", "different", "own", "still",
+    "already", "again", "even", "really", "actually", "probably",
+    "maybe", "file", "code", "thing", "way", "something", "anything",
+}
+
+DB_PATHS = [
+    os.path.expanduser("~/.local/share/zikaron/zikaron.db"),
+    os.path.expanduser("~/.local/share/brainlayer/brainlayer.db"),
+]
+
+
+def get_db_path():
+    env = os.environ.get("BRAINLAYER_DB")
+    if env and os.path.exists(env):
+        return env
+    for p in DB_PATHS:
+        if os.path.exists(p):
+            return p
+    return None
+
+
+def is_deep_mode(prompt_lower):
+    for trigger in DEEP_TRIGGERS:
+        if trigger in prompt_lower:
+            return True
+    return False
+
+
+def extract_keywords(prompt):
+    """Extract meaningful keywords from the prompt for FTS5 search."""
+    # Remove URLs, path-like tokens (starting with / or ~), and inline `code` spans
+    text = re.sub(r"https?://\S+", "", prompt)
+    text = re.sub(r"[/~]\S+", "", text)
+    text = re.sub(r"`[^`]+`", "", text)
+
+    # Extract words (keep hyphens for compound terms like "6pm-mini")
+    words = re.findall(r"[a-zA-Z0-9][\w-]*", text.lower())
+
+    # Filter out stop words and short words
+    keywords = []
+    seen = set()
+    for w in words:
+        if w not in STOP_WORDS and len(w) > 2 and w not in seen:
+            keywords.append(w)
+            seen.add(w)
+
+    return keywords[:8]  # Cap at 8 keywords for FTS5 performance
+
+
+def truncate(text, max_chars=200):
+    # Clean up multi-line content for compact display
+    text = re.sub(r"\n+", " | ", text.strip())
+    if len(text) <= max_chars:
+        return text
+    return text[:max_chars].rsplit(" ", 1)[0] + "..."
+
+
+def elapsed_ms(start):
+    return (time.monotonic() - start) * 1000
+
+
+def detect_entities_in_prompt(prompt, conn):
+    """Detect known KG entity names in the prompt.
+
+    Checks bigrams and single capitalized words (4+ chars) against kg_entities.
+    Returns list of dicts: {id, name, entity_type}.
+    Fast: exact SQL LOWER() match, no FTS5 overhead.
+
+    Only injects context for high-signal entity types (person, company, agent).
+    Technology/concept entities are too noisy for automatic injection.
+    """
+    # Entity types that warrant automatic context injection
+    INJECT_TYPES = {"person", "company", "agent"}
+
+    def _clean_word(w):
+        """Strip non-alphanumerics (hyphens kept) and a trailing possessive ('s or bare ')."""
+        # Remove all non-alphanumeric except hyphen (for compound words)
+        cleaned = re.sub(r"[^a-zA-Z0-9-]", "", w)
+        # The apostrophe is already gone from `cleaned`, so a possessive now ends in a bare "s"
+        if cleaned.endswith("s") and len(cleaned) > 2:
+            # Heuristic: original word ended in ASCII 's (or a lone '), so drop the final "s"
+            if re.search(r"'s?$", w):
+                cleaned = cleaned[:-1]
+        return cleaned
+
+    words = prompt.split()
+    cleaned_words = [_clean_word(w) for w in words]
+    candidates = []
+
+    # Bigrams: "Avi Simon", "Fedor Sidorov" etc.
+    for i in range(len(cleaned_words) - 1):
+        w1, w2 = cleaned_words[i], cleaned_words[i + 1]
+        if not w1 or not w2:
+            continue
+        # At least one word must start uppercase (entities are proper nouns)
+        if w1[0].isupper() or w2[0].isupper():
+            candidates.append(f"{w1} {w2}")
+
+    # Single capitalized words: 4+ chars (skips "The", "How", etc.) and not ALL-CAPS
+    for w in cleaned_words:
+        if len(w) >= 4 and w[0].isupper() and not w.isupper():
+            candidates.append(w)
+
+    if not candidates:
+        return []
+
+    matched = []
+    seen_ids = set()
+    try:
+        for candidate in candidates:
+            rows = conn.execute(
+                "SELECT id, name, entity_type FROM kg_entities WHERE LOWER(name) = LOWER(?) LIMIT 1",
+                (candidate,),
+            ).fetchall()
+            if rows:
+                eid, name, etype = rows[0]
+                if eid not in seen_ids and etype in INJECT_TYPES:
+                    seen_ids.add(eid)
+                    matched.append({"id": eid, "name": name, "entity_type": etype})
+    except sqlite3.Error:
+        pass
+
+    return matched
+
+
+def get_entity_chunks(entity_id, conn, limit=3):
+    """Get top linked chunk summaries for an entity."""
+    try:
+        rows = conn.execute(
+            """
+            SELECT c.content, c.created_at, c.project
+            FROM kg_entity_chunks ec
+            JOIN chunks c ON c.id = ec.chunk_id
+            WHERE ec.entity_id = ?
+            ORDER BY ec.relevance DESC
+            LIMIT ?
+            """,
+            (entity_id, limit),
+        ).fetchall()
+        return rows
+    except sqlite3.Error:
+        return []
+
+
+def main():
+    start = time.monotonic()
+
+    try:
+        hook_input = json.loads(sys.stdin.read())
+    except (json.JSONDecodeError, EOFError):
+        sys.exit(0)
+
+    prompt = hook_input.get("prompt", "")
+    if not prompt or len(prompt) < MIN_PROMPT_LENGTH:
+        sys.exit(0)
+
+    prompt_lower = prompt.lower()
+
+    # Skip if prompt is a slash command
+    if prompt.strip().startswith("/"):
+        sys.exit(0)
+
+    deep = is_deep_mode(prompt_lower)
+    keywords = extract_keywords(prompt)
+
+    if not keywords:
+        sys.exit(0)
+
+    db_path = get_db_path()
+    if not db_path:
+        sys.exit(0)
+
+    try:
+        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=2)
+        conn.execute("PRAGMA journal_mode=WAL")
+        conn.execute("PRAGMA query_only=true")
+    except sqlite3.Error:
+        sys.exit(0)
+
+    limit = 8 if deep else 3
+
+    # Build FTS5 query: join keywords with OR for broader matching
+    fts_query = " OR ".join(f'"{kw}"' for kw in keywords)
+
+    lines = []
+    try:
+        # Phase A: Entity routing — detect known entity names in prompt
+        # and inject entity profile before FTS results.
+        if elapsed_ms(start) < DEADLINE_MS:
+            entities = detect_entities_in_prompt(prompt, conn)
+            for entity in entities[:2]:  # at most 2 entities per prompt
+                etype = entity["entity_type"]
+                ename = entity["name"]
+                lines.append(f"[Entity: {ename} — {etype}]")
+                # Get entity-linked chunks for context
+                entity_chunks = get_entity_chunks(entity["id"], conn, limit=2)
+                for content, created_at, project in entity_chunks:
+                    date = created_at[:10] if created_at else "?"
+                    proj = f" ({project})" if project else ""
+                    lines.append(f"- [{date}{proj}] {truncate(content, max_chars=150)}")
+
+        if elapsed_ms(start) < DEADLINE_MS:
+            rows = conn.execute(
+                """
+                SELECT c.content, c.importance, c.project, c.tags, c.created_at
+                FROM chunks_fts f
+                JOIN chunks c ON c.id = f.chunk_id
+                WHERE chunks_fts MATCH ?
+                ORDER BY rank
+                LIMIT ?
+                """,
+                (fts_query, limit),
+            ).fetchall()
+
+            if rows:
+                mode_label = "deep" if deep else "auto"
+                if lines:
+                    # Entity section already started — the header below doubles as the separator
+                    lines.append(f"[BrainLayer {mode_label}] Memories matching your prompt:")
+                else:
+                    lines.append(f"[BrainLayer {mode_label}] Memories matching your prompt:")
+                for content, importance, project, tags, created_at in rows:
+                    date = created_at[:10] if created_at else "?"
+                    imp = f" imp:{importance:.0f}" if importance else ""
+                    proj = f" ({project})" if project else ""
+                    lines.append(
+                        f"- [{date}{imp}{proj}] {truncate(content)}"
+                    )
+
+                if not deep:
+                    lines.append(
+                        "(Use brain_search for deeper results.)"
+                    )
+    except sqlite3.Error:
+        pass
+    finally:
+        conn.close()
+
+    if lines:
+        print("\n".join(lines))
+
+    sys.exit(0)
+
+
+if __name__ == "__main__":
+    main()
diff --git a/scripts/run_evals.py b/scripts/run_evals.py
new file mode 100644
index 00000000..c629e7f2
--- /dev/null
+++ b/scripts/run_evals.py
@@ -0,0 +1,95 @@
+#!/usr/bin/env python3
+"""Run all eval cases and save scored results to tests/eval_baselines.json.
+
+Usage:
+    python scripts/run_evals.py               # run + save + print summary
+    python scripts/run_evals.py --no-save     # run + print, don't save
+    python scripts/run_evals.py --diff        # compare to saved baseline
+
+This script calls run_baseline() and run_hook_baseline() from test_eval_baselines.py
+and writes the combined results to tests/eval_baselines.json.
+"""
+
+import argparse
+import json
+import sys
+from pathlib import Path
+
+# Ensure src is importable
+src = Path(__file__).parent.parent / "src"
+if str(src) not in sys.path:
+    sys.path.insert(0, str(src))
+
+tests_dir = Path(__file__).parent.parent / "tests"
+if str(tests_dir) not in sys.path:
+    sys.path.insert(0, str(tests_dir))
+
+from test_eval_baselines import run_baseline, run_hook_baseline
+
+BASELINE_FILE = Path(__file__).parent.parent / "tests" / "eval_baselines.json"
+
+
+def main():
+    parser = argparse.ArgumentParser(description="Run BrainLayer eval suite")
+    parser.add_argument("--no-save", action="store_true", help="Don't save results")
+    parser.add_argument("--diff", action="store_true", help="Compare to saved baseline")
+    args = parser.parse_args()
+
+    # Load previous baseline for diff
+    prev = None
+    if args.diff and BASELINE_FILE.exists():
+        prev = json.loads(BASELINE_FILE.read_text())
+
+    print("Running search quality evals...")
+    search_results = run_baseline()
+    print("Running hook entity injection evals...")
+    hook_results = run_hook_baseline()
+
+    combined = {
+        "search": search_results,
+        "hook": hook_results,
+        "combined_score_pct": round(
+            (search_results["pass_count"] + hook_results["pass_count"])
+            / (search_results["total"] + hook_results["total"]) * 100,
+            1,
+        ),
+    }
+
+    # Print summary
+    print(f"\n=== brain_search quality ===")
+    print(f"Score: {search_results['pass_count']}/{search_results['total']} ({search_results['score_pct']}%)")
+    if prev:
+        prev_pct = prev.get("search", {}).get("score_pct", 0)
+        delta = search_results["score_pct"] - prev_pct
+        print(f"Delta vs baseline: {delta:+.1f}%")
+    for case in search_results["cases"]:
+        status = "✓" if case["passed"] else "✗"
+        rank = f"rank={case['actual_rank']}" if case["actual_rank"] else "not found"
+        print(f"  {status} [{case['name']}] {rank}")
+        if not case["passed"]:
+            print(f"       top: {case['top_snippet'][:70]!r}")
+
+    print(f"\n=== hook entity injection ===")
+    print(f"Score: {hook_results['pass_count']}/{hook_results['total']} ({hook_results['score_pct']}%)")
+    if prev:
+        prev_hook_pct = prev.get("hook", {}).get("score_pct", 0)
+        hook_delta = hook_results["score_pct"] - prev_hook_pct
+        print(f"Delta vs baseline: {hook_delta:+.1f}%")
+    for case in hook_results["cases"]:
+        status = "✓" if case["passed"] else "✗"
+        print(f"  {status} [{case['name']}]")
+        if not case["passed"]:
+            print(f"       output: {case['output_preview'][:80]!r}")
+
+    print(f"\nCombined: {combined['combined_score_pct']}%")
+    if prev:
+        prev_combined = prev.get("combined_score_pct", 0)
+        print(f"Delta vs baseline: {combined['combined_score_pct'] - prev_combined:+.1f}%")
+
+    if not args.no_save:
+        BASELINE_FILE.write_text(json.dumps(combined, indent=2))
+        print(f"\nSaved to {BASELINE_FILE}")
+
+
+if __name__ == "__main__":
+    main()
diff --git a/tests/conftest.py b/tests/conftest.py
index 8f197199..8e9109aa 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -5,6 +5,14 @@
 import pytest
 
 
+def pytest_configure(config):
+    """Register custom pytest marks."""
+    config.addinivalue_line(
+        "markers",
+        "live: mark test as requiring a live production DB (skipped in CI if DB absent)",
+    )
+
+
 @pytest.fixture
 def test_user() -> str:
     """Username for path-based tests.
diff --git a/tests/eval_baselines.json b/tests/eval_baselines.json
new file mode 100644
index 00000000..249d7de5
--- /dev/null
+++ b/tests/eval_baselines.json
@@ -0,0 +1,361 @@
+{
+  "search": {
+    "run_date": "2026-03-09",
+    "db": "/Users/etanheyman/.local/share/zikaron/zikaron.db",
+    "cases": [
+      {
+        "name": "entity_avi_simon",
+        "query": "Avi Simon platform invites schedule",
+        "expected_snippets": [
+          "avi simon",
+          "6pm",
+          "6PM"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-82fa7ad5a3614d46",
+        "top_snippet": "Avi Simon \u2014 6PM Platform Tech Advisor Call (Feb 24, 2026)\n\nContext: 60-min tech screen call with Avi"
+      },
+      {
+        "name": "entity_fedor",
+        "query": "Fedor iOS build handover GitHub",
+        "expected_snippets": [
+          "fedor",
+          "Fedor",
+          "iOS"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-e731b63ab50c4769",
+        "top_snippet": "DECISION: MeHayom iOS build ownership (March 10 2026): Fedor submits iOS build to App Store review, "
+      },
+      {
+        "name": "entity_yuval_mehayom",
+        "query": "MeHayom Yuval sprint payment",
+        "expected_snippets": [
+          "yuval",
+          "Yuval",
+          "MeHayom"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-6494601725414371",
+        "top_snippet": "[2026-03-10] MeHayom Sprint 1: First 50% payment (2,100 NIS) landed in bank. Yuval added Etan to App"
+      },
+      {
+        "name": "tag_decision",
+        "query": "important decision",
+        "expected_snippets": [
+          "decision",
+          "DECISION",
+          "chose"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": "decision",
+        "passed": true,
+        "actual_rank": 2,
+        "top_chunk_id": "manual-0822d7551cde4362",
+        "top_snippet": "6PM Research Results Summary (Feb 28, 3 deep research papers):\n\nEXTRACTION OPTIMIZATION:\n- Tool desc"
+      },
+      {
+        "name": "tag_voicelayer_scoped",
+        "query": "architecture decision voicelayer",
+        "expected_snippets": [
+          "VoiceLayer",
+          "voice"
+        ],
+        "top_n": 3,
+        "project": "voicelayer",
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-a5b9d46fcc0b484a",
+        "top_snippet": "VoiceLayer architecture BUG: FlowBar is the client, MCP servers are the servers \u2014 backwards. Each MC"
+      },
+      {
+        "name": "recency_milestone",
+        "query": "measurement mandate evals before improvements",
+        "expected_snippets": [
+          "measurement",
+          "evals",
+          "baseline"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-1a0c8f70a35d4ac4",
+        "top_snippet": "MEASUREMENT MANDATE (March 9, 2026): All improvements must be measured before and after. No \"it feel"
+      },
+      {
+        "name": "hebrew_style_correction",
+        "query": "em dashes Hebrew writing style correction freelance",
+        "expected_snippets": [
+          "hebrew",
+          "Hebrew",
+          "em dash"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-83d0d8baa2c549b4",
+        "top_snippet": "ETAN'S HEBREW WRITING STYLE \u2014 Freelance/WhatsApp (extracted from iterative corrections, March 10 202"
+      },
+      {
+        "name": "cross_fts5_architecture",
+        "query": "FTS5 search quality gaps summary tags indexed",
+        "expected_snippets": [
+          "FTS5",
+          "fts5",
+          "summary"
+        ],
+        "top_n": 3,
+        "project": "brainlayer",
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-17ea700f44794d55",
+        "top_snippet": "BrainLayer Search Pipeline Gaps (from Cursor audit Mar 2):\n1. FTS5 only indexes chunks.content \u2014 sum"
+      },
+      {
+        "name": "cross_golems_monorepo",
+        "query": "golems monorepo architecture golem-powers CLI",
+        "expected_snippets": [
+          "golems",
+          "golem",
+          "monorepo"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "Do a VERY THOROUGH exploration of the Golems monorepo automation infrastructure. I need to understan"
+      },
+      {
+        "name": "decision_voicelayer_rule",
+        "query": "voicelayer local CLI voice tools architecture rule",
+        "expected_snippets": [
+          "local",
+          "CLI",
+          "VoiceLayer"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "## Summary of Key Findings\n\n### 1. **Async Voice Architecture Research** (first 100 lines)\nThe hybri"
+      },
+      {
+        "name": "decision_brainlayer_v3",
+        "query": "BrainLayer v3 architecture decisions sqlite-vec",
+        "expected_snippets": [
+          "BrainLayer",
+          "architecture",
+          "v3"
+        ],
+        "top_n": 3,
+        "project": "brainlayer",
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-705bf11e2e7c436a",
+        "top_snippet": "BrainLayer v3 Plan \u2014 Major Architecture Decisions (Feb 24, 2026):\n\n1. VISION: BrainLayer becomes the"
+      },
+      {
+        "name": "memory_whoop",
+        "query": "remember when we discussed WHOOP recovery score",
+        "expected_snippets": [
+          "whoop",
+          "Whoop",
+          "WHOOP"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "\u23fa Bash(TOKEN=$(python3 -c \"import json; print(json.load(open('/tmp/whoop-tokens.json'))['access_toke"
+      },
+      {
+        "name": "memory_coach_schedule",
+        "query": "morning schedule wake up huberman protocol",
+        "expected_snippets": [
+          "schedule",
+          "morning",
+          "huberman"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "   - **`/Users/etanheyman/Gits/golems/skills/golem-powers/coach/workflows/health.md`** (REWRITTEN th"
+      },
+      {
+        "name": "mined_search_quality",
+        "query": "brainlayer search quality evaluation evals measurement mandate",
+        "expected_snippets": [
+          "measurement",
+          "evals",
+          "baseline"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-1a0c8f70a35d4ac4",
+        "top_snippet": "MEASUREMENT MANDATE (March 9, 2026): All improvements must be measured before and after. No \"it feel"
+      },
+      {
+        "name": "mined_enrichment_backend",
+        "query": "enrichment MLX Groq backend progress stats chunks",
+        "expected_snippets": [
+          "enrichment",
+          "MLX",
+          "Groq"
+        ],
+        "top_n": 3,
+        "project": "brainlayer",
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "manual-0717f7968aff4c5c",
+        "top_snippet": "Switched enrichment from MLX to Groq (March 2 2026). Reason: MLX server (Qwen2.5-Coder-14B, ~5GB RAM"
+      },
+      {
+        "name": "mined_6pm_decisions",
+        "query": "6pm scheduling architecture confirmed decisions",
+        "expected_snippets": [
+          "6pm",
+          "6PM",
+          "scheduling"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "You are 6pmClaude \u2014 working on the 6PM blind scheduling system.\n\n## First: Load Context\n\n```\nbrain_s"
+      },
+      {
+        "name": "mined_cursor_cli",
+        "query": "cursor CLI agent versus cursor IDE difference",
+        "expected_snippets": [
+          "cursor",
+          "Cursor",
+          "CLI"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "Perfect! Now I have comprehensive information. Let me compile a detailed research report for you:\n\n#"
+      },
+      {
+        "name": "gap_auth_cross_project",
+        "query": "authentication JWT tokens security implementation",
+        "expected_snippets": [
+          "auth",
+          "JWT",
+          "token"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": true,
+        "actual_rank": 1,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "from brainlayer import brain\nresult = brain.search(\"how did I implement auth\")\nbrain.store(\"Decided "
+      },
+      {
+        "name": "gap_hebrew_semantic",
+        "query": "\u05dc\u05d5\u05d7 \u05d6\u05de\u05e0\u05d9\u05dd \u05d1\u05d5\u05e7\u05e8 \u05e7\u05d5\u05d3",
+        "expected_snippets": [
+          "\u05dc\u05d5\u05d7 \u05d6\u05de\u05e0\u05d9\u05dd",
+          "schedule",
+          "morning"
+        ],
+        "top_n": 3,
+        "project": null,
+        "tag": null,
+        "passed": false,
+        "actual_rank": null,
+        "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym",
+        "top_snippet": "\u05d4\u05d9\u05d9, \u05d0\u05e0\u05d9 \u05de\u05db\u05d9\u05df \u05d0\u05ea \u05d4\u05e4\u05e8\u05d5\u05d9\u05e7\u05d8 \u05d4\u05d6\u05d4 \u05d1\u05e9\u05d1\u05d9\u05dc \u05d0\u05de\u05d0 \u05e9\u05dc\u05d9 \u05d5\u05d1\u05df \u05d6\u05d5\u05d2\u05d4. \u05d4\u05dd \u05d8\u05e1\u05d9\u05dd \u05dc\u05d0\u05dc\u05d1\u05e0\u05d9\u05d4.\n\n\u05d9\u05e9 \u05dc\u05d4\u05dd \u05db\u05d1\u05e8 \u05db\u05de\u05d4 \u05d3\u05d1\u05e8\u05d9\u05dd \u05de\u05ea\u05d5\u05db\u05e0\u05e0\u05d9\u05dd"
+      }
+    ],
+    "pass_count": 18,
+    "total": 19,
+    "score_pct": 94.7
+  },
+  "hook": {
+    "section": "hook_entity_injection",
+    "pass_count": 4,
+    "total": 4,
+    "score_pct": 100.0,
+    "cases": [
+      {
+        "name": "hook_entity_avi_simon",
+        "query": "What are Avi Simon's meeting preferences?",
+        "expected": [
+          "[entity:",
+          "entity: avi simon",
+          "person"
+        ],
+        "passed": true,
+        "output_preview": "[Entity: Avi Simon \u2014 person]\n- [? (golems)] # Family Brain \u2014 Multi-User BrainLayer + Relationship Ecosystem | > From \"Etan's tools\" to a people/busine"
+      },
+      {
+        "name": "hook_entity_fedor",
+        "query": "What is Fedor working on with GitHub access?",
+        "expected": [
+          "[entity:",
+          "entity: fedor",
+          "person"
+        ],
+        "passed": true,
+        "output_preview": "[Entity: Fedor \u2014 person]\n- [? (orchestrator)] coachClaude session transcript \u2014 March 9-10, 2026. ~/Gits/golems/packages/coach | ## Key Events: | ### 1"
+      },
+      {
+        "name": "hook_entity_first_line",
+        "query": "Tell me about Avi Simon and his 6PM project",
+        "expected": [
+          "[entity"
+        ],
+        "passed": true,
+        "output_preview": "[Entity: Avi Simon \u2014 person]\n- [? (golems)] # Family Brain \u2014 Multi-User BrainLayer + Relationship Ecosystem | > From \"Etan's tools\" to a people/busine"
+      },
+      {
+        "name": "hook_no_entity_generic",
+        "query": "How does authentication work in Python?",
+        "expected": null,
+        "passed": true,
+        "output_preview": "[BrainLayer auto] Memories matching your prompt:\n- [2026-02-19 imp:7 (golems)] Also, I got the authentication application successfully set up on PyPi."
+      }
+    ]
+  },
+  "combined_score_pct": 95.7
+}
\ No newline at end of file
diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py
new file mode 100644
index 00000000..31f16b17
--- /dev/null
+++ b/tests/test_eval_baselines.py
@@ -0,0 +1,703 @@
+"""Phase 0: Search Quality Eval Suite — Baselines.
+
+Measures real BrainLayer search quality against the production DB.
+Requires: `pytest -m live` (skipped in CI, requires live DB).
+
+Grade: expected content snippet in top N = PASS.
+Baseline file: tests/eval_baselines.json (generated by scripts/run_evals.py).
+
+Run baseline:
+    python scripts/run_evals.py > tests/eval_baselines.json
+
+Run live tests:
+    pytest tests/test_eval_baselines.py -m live -v
+
+Test cases cover:
+- Entity routing (entity names in query)
+- Tag filter (structured queries)
+- Recency (recent content prioritized)
+- Hebrew FTS (non-ASCII content)
+- Cross-project search
+- Decision/correction retrieval
+- Memory word detection
+- Real mined queries from session history
+"""
+
+import json
+from pathlib import Path
+
+import pytest
+
+# ── Fixtures ────────────────────────────────────────────────────────────────
+
+EVAL_CASES_FILE = Path(__file__).parent / "eval_baselines.json"
+
+
+@pytest.fixture(scope="module")
+def live_store():
+    """Yield a VectorStore opened on the real production DB, closed at module teardown.
+
+    Skips when brainlayer cannot be imported or the DB file is absent (CI environment).
+    """
+    import os
+    import sys
+
+    # Put the repo's src/ on sys.path BEFORE probing for brainlayer; probing first
+    # would skip whenever the package is only reachable through that directory.
+    src = Path(__file__).parent.parent / "src"
+    if str(src) not in sys.path:
+        sys.path.insert(0, str(src))
+    pytest.importorskip("brainlayer", reason="brainlayer not installed")
+    from brainlayer.paths import get_db_path
+    from brainlayer.vector_store import VectorStore
+
+    db = get_db_path()
+    if not os.path.exists(db):
+        pytest.skip(f"Live DB not found at {db}")
+    store = VectorStore(db)
+    yield store
+    store.close()
+
+
+@pytest.fixture(scope="module")
+def live_model():
+    """Load the real embedding model once and share it across this module's tests."""
+    import sys
+
+    src_dir = str(Path(__file__).parent.parent / "src")
+    if src_dir not in sys.path:
+        sys.path.insert(0, src_dir)
+
+    from brainlayer.embeddings import get_embedding_model
+
+    return get_embedding_model()
+
+
+def _search(store, model, query, n=5, project=None, tag=None) -> tuple[list[str], list[str]]:
+    """Embed *query*, run the store's hybrid search, and return (chunk_ids, documents)."""
+    hits = store.hybrid_search(
+        query_embedding=model.embed_query(query),
+        query_text=query,
+        n_results=n,
+        project_filter=project,
+        tag_filter=tag,
+    )
+    chunk_ids, documents = hits["ids"][0], hits["documents"][0]
+    return chunk_ids, documents
+
+
+def _passes(docs: list[str], expected_snippets: list[str], top_n: int) -> bool:
+    """Case-insensitively check the first top_n docs for any expected snippet.
+
+    True as soon as one snippet is a substring of one of those docs; False when
+    nothing matches (including when docs or expected_snippets is empty).
+    """
+    needles = [snippet.lower() for snippet in expected_snippets]
+    return any(n in text for text in (d.lower() for d in docs[:top_n]) for n in needles)
+
+
+# ── Entity Routing Tests ─────────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestEntityRouting:
+    """Entity names in query should surface entity-linked chunks."""
+
+    def test_avi_simon_entity(self, live_store, live_model):
+        """Query about Avi Simon should surface 6PM / MeHayom interaction chunks."""
+        ids, docs = _search(live_store, live_model, "Avi Simon platform invites schedule", n=5)
+        assert _passes(docs, ["avi simon", "6pm", "6PM"], top_n=3), (
+            f"Expected Avi Simon content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_fedor_entity(self, live_store, live_model):
+        """Query about Fedor should surface MeHayom / iOS build chunks."""
+        ids, docs = _search(live_store, live_model, "Fedor iOS build handover GitHub", n=5)
+        assert _passes(docs, ["fedor", "Fedor", "iOS", "MeHayom"], top_n=3), (
+            f"Expected Fedor content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_yuval_mehayom_entity(self, live_store, live_model):
+        """Query about Yuval (MeHayom client) should surface sprint/payment chunks."""
+        ids, docs = _search(live_store, live_model, "MeHayom Yuval sprint payment", n=5)
+        assert _passes(docs, ["yuval", "Yuval", "mehayom", "MeHayom"], top_n=3), (
+            f"Expected MeHayom/Yuval content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Tag Filter Tests ─────────────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestTagFilter:
+    """Tag-filtered searches should respect chunk tag metadata."""
+
+    def test_decision_tag_returns_decisions(self, live_store, live_model):
+        """tag='decision' filter should return brain_store chunks tagged 'decision'."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "important decision",
+            n=5,
+            tag="decision",
+        )
+        assert len(ids) >= 1, "tag='decision' filter returned 0 results"
+        # All results should contain decision-related content
+        assert _passes(docs, ["decision", "DECISION", "Decision", "chose", "decided"], top_n=3), (
+            f"tag=decision filter didn't return decision content, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_voicelayer_decision_scoped(self, live_store, live_model):
+        """project=voicelayer + decision query should return voicelayer decisions."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "architecture decision voicelayer",
+            n=5,
+            project="voicelayer",
+        )
+        assert len(ids) >= 1, "voicelayer project scoped search returned 0 results"
+        assert _passes(docs, ["voicelayer", "VoiceLayer", "voice", "Voice"], top_n=3), (
+            f"Expected VoiceLayer content, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Recency Tests ────────────────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestRecency:
+    """Recent content should rank above old content for recency-sensitive queries."""
+
+    @pytest.mark.xfail(
+        reason="hybrid_search lacks temporal 'today' awareness; _brain_search router handles this via _current_context"
+    )
+    def test_today_returns_recent_chunks(self, live_store, live_model):
+        """'what happened today' should return chunks from 2026-03-09 or 2026-03-10.
+
+        Gap: hybrid_search scores by relevance+recency but doesn't boost 'today=24h'.
+        Fix: route 'today' queries through _brain_search → _current_context.
+        """
+        ids, docs = _search(live_store, live_model, "what happened today current work progress", n=5)
+        cursor = live_store.conn.cursor()
+        recent_found = False
+        for cid in ids[:5]:
+            rows = list(cursor.execute("SELECT created_at FROM chunks WHERE id = ?", (cid,)))
+            if rows and rows[0][0]:
+                date_str = str(rows[0][0])
+                if "2026-03-09" in date_str or "2026-03-10" in date_str:
+                    recent_found = True
+                    break
+        assert recent_found, (
+            f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. "
+            f"Got chunk_ids: {ids[:5]}"
+        )
+
+    def test_recent_milestone_findable(self, live_store, live_model):
+        """Recent brain_store milestone should appear for relevant query."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "measurement mandate evals before improvements",
+            n=5,
+        )
+        assert _passes(docs, ["measurement", "Measurement", "MEASUREMENT", "evals", "baseline"], top_n=3), (
+            f"Expected measurement mandate chunk in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Hebrew FTS Tests ─────────────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestHebrewFTS:
+    """Hebrew content should be discoverable via FTS5 and semantic search."""
+
+    def test_hebrew_query_returns_hebrew_content(self, live_store, live_model):
+        """Hebrew query should return Hebrew-language chunks."""
+        ids, docs = _search(live_store, live_model, "לוח זמנים בוקר תזמון", n=5)
+        # Check that results contain Hebrew characters
+        hebrew_found = any(any("\u0590" <= c <= "\u05ea" for c in doc) for doc in docs[:3])
+        assert hebrew_found, (
+            f"Expected Hebrew content in top 3 results for Hebrew query. "
+            f"Got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_hebrew_style_correction_findable(self, live_store, live_model):
+        """Hebrew writing style correction should be findable by style query."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "em dashes Hebrew writing style correction freelance",
+            n=5,
+        )
+        assert _passes(docs, ["hebrew", "Hebrew", "HEBREW", "em dash", "אתן", "style"], top_n=3), (
+            f"Expected Hebrew style chunk in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Cross-Project Search Tests ───────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestCrossProject:
+    """Cross-project searches should surface multi-repo content."""
+
+    def test_brainlayer_fts5_architecture_findable(self, live_store, live_model):
+        """BrainLayer FTS5 architecture discussion should be findable."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "FTS5 search quality gaps summary tags indexed",
+            n=5,
+            project="brainlayer",
+        )
+        assert _passes(docs, ["FTS5", "fts5", "summary", "tags", "indexed", "gaps"], top_n=3), (
+            f"Expected FTS5 gap analysis in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_golems_architecture_findable(self, live_store, live_model):
+        """Golems architecture discussions should be findable cross-project."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "golems monorepo architecture golem-powers CLI",
+            n=5,
+        )
+        assert _passes(docs, ["golems", "Golems", "golem", "monorepo"], top_n=3), (
+            f"Expected golems architecture in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Decision/Correction Retrieval Tests ─────────────────────────────────────
+
+
+@pytest.mark.live
+class TestDecisionRetrieval:
+    """Stored decisions and corrections should be retrievable."""
+
+    def test_voicelayer_architectural_rule_findable(self, live_store, live_model):
+        """The VoiceLayer architectural rule (local tools) should be findable."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "voicelayer local CLI voice tools architecture rule",
+            n=5,
+        )
+        assert _passes(
+            docs,
+            ["local", "CLI", "architecture", "VoiceLayer", "voicelayer", "voice"],
+            top_n=3,
+        ), (
+            f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_brainlayer_decision_findable(self, live_store, live_model):
+        """BrainLayer v3 major architecture decisions should be findable."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "BrainLayer v3 architecture decisions sqlite-vec embeddings",
+            n=5,
+            project="brainlayer",
+        )
+        assert _passes(docs, ["BrainLayer", "brainlayer", "architecture", "v3", "sqlite"], top_n=3), (
+            f"Expected BrainLayer architecture decision in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Memory / Deep Mode Tests ─────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestMemoryRetrieval:
+    """Memory recall queries should surface stored manual chunks."""
+
+    def test_whoop_discussion_findable(self, live_store, live_model):
+        """WHOOP recovery discussions should be findable via memory query."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "remember when we discussed WHOOP recovery score",
+            n=5,
+        )
+        assert _passes(docs, ["whoop", "Whoop", "WHOOP", "recovery", "Recovery"], top_n=3), (
+            f"Expected WHOOP content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_coach_schedule_findable(self, live_store, live_model):
+        """Coach scheduling context should be findable."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "morning schedule wake up huberman protocol",
+            n=5,
+        )
+        assert _passes(docs, ["schedule", "morning", "huberman", "Huberman", "wake", "coach"], top_n=3), (
+            f"Expected schedule/huberman content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Mined From Logs: Real User Queries ───────────────────────────────────────
+
+
+@pytest.mark.live
+class TestMinedQueries:
+    """Real queries from session transcripts — highest-value test cases."""
+
+    def test_brainlayer_search_quality_evaluation(self, live_store, live_model):
+        """User's actual query: search quality eval measurement (seen in sessions)."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "brainlayer search quality evaluation evals measurement mandate",
+            n=5,
+        )
+        assert _passes(docs, ["measurement", "MEASUREMENT", "evals", "baseline", "quality"], top_n=3), (
+            f"Expected measurement mandate chunk in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_enrichment_backend_mlx_groq(self, live_store, live_model):
+        """User query: enrichment backend status (from brainlayer sessions)."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "enrichment MLX Groq backend progress stats chunks",
+            n=5,
+            project="brainlayer",
+        )
+        assert _passes(docs, ["enrichment", "MLX", "mlx", "Groq", "groq", "backend", "chunks"], top_n=3), (
+            f"Expected enrichment backend info in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_6pm_architecture_decisions(self, live_store, live_model):
+        """User query: 6pm architecture decisions (mined from 6pm-mini sessions)."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "6pm scheduling architecture confirmed decisions",
+            n=5,
+        )
+        assert _passes(docs, ["6pm", "6PM", "scheduling", "architecture", "decision"], top_n=3), (
+            f"Expected 6PM architecture in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+    def test_cursor_cli_vs_ide_difference(self, live_store, live_model):
+        """User query: cursor CLI agent vs cursor IDE (mined from 6pm sessions)."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "cursor CLI agent versus cursor IDE difference",
+            n=5,
+        )
+        assert _passes(docs, ["cursor", "Cursor", "CLI", "IDE", "agent"], top_n=3), (
+            f"Expected cursor CLI vs IDE content in top 3, got: {[d[:60] for d in docs[:3]]}"
+        )
+
+
+# ── Gap Identification Tests ─────────────────────────────────────────────────
+
+
+@pytest.mark.live
+class TestKnownGaps:
+    """Test cases expected to FAIL at baseline — document search gaps."""
+
+    @pytest.mark.xfail(reason="Cross-project auth: FTS returns brainlayer code not auth patterns")
+    def test_authentication_patterns_cross_project(self, live_store, live_model):
+        """Searching 'authentication JWT' cross-project should find auth patterns, not brainlayer code."""
+        ids, docs = _search(
+            live_store,
+            live_model,
+            "authentication JWT tokens security implementation",
+            n=5,
+        )
+        # Expect actual auth implementation, not brainlayer internals
+        assert _passes(docs, ["auth", "JWT", "token", "security"], top_n=3) and not _passes(
+            docs, ["hybrid_search", "vector_store", "VectorStore"], top_n=2
+        ), (f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}")
+
+    @pytest.mark.xfail(reason="Hebrew semantic: returns unrelated Hebrew content (Albania trip)")
+    def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model):
+        """Hebrew 'schedule morning' query should return schedule content, not unrelated Hebrew."""
+        ids, docs = _search(live_store, live_model, "לוח זמנים בוקר קוד", n=5)
+        # Should NOT return Albania trip planning
+        albania_in_results = any("אלבניה" in doc for doc in docs[:3])
+        assert not albania_in_results, (
+            f"Hebrew query returned Albania trip content instead of schedule. "
+            f"This is the gap to fix."
+        )
+
+
+# ── Baseline Runner ──────────────────────────────────────────────────────────
+
+
+def run_baseline() -> dict:
+    """Run all eval cases and return scored results.
+
+    Called by scripts/run_evals.py to generate eval_baselines.json.
+    """
+    import os
+    import sys
+
+    src = Path(__file__).parent.parent / "src"
+    if str(src) not in sys.path:
+        sys.path.insert(0, str(src))
+
+    from brainlayer.embeddings import get_embedding_model
+    from brainlayer.paths import get_db_path
+    from brainlayer.vector_store import VectorStore
+
+    db = get_db_path()
+    store = VectorStore(db)
+    model = get_embedding_model()
+
+    # All eval cases: (name, query, expected_snippets, top_n, project, tag)
+    eval_cases = [
+        # Entity routing
+        ("entity_avi_simon", "Avi Simon platform invites schedule", ["avi simon", "6pm", "6PM"], 3, None, None),
+        ("entity_fedor", "Fedor iOS build handover GitHub", ["fedor", "Fedor", "iOS", "MeHayom"], 3, None, None),
+        ("entity_yuval_mehayom", "MeHayom Yuval sprint payment", ["yuval", "Yuval", "MeHayom"], 3, None, None),
+        # Tag filter
+        ("tag_decision", "important decision", ["decision", "DECISION", "chose", "decided"], 3, None, "decision"),
+        ("tag_voicelayer_scoped", "architecture decision voicelayer", ["VoiceLayer", "voice"], 3, "voicelayer", None),
+        # Recency
+        ("recency_milestone", "measurement mandate evals before improvements", ["measurement", "evals", "baseline"], 3, None, None),
+        # Hebrew FTS
+        ("hebrew_style_correction", "em dashes Hebrew writing style correction freelance", ["hebrew", "Hebrew", "em dash", "style"], 3, None, None),
+        # Cross-project
+        ("cross_fts5_architecture", "FTS5 search quality gaps summary tags indexed", ["FTS5", "fts5", "summary", "gaps"], 3, "brainlayer", None),
+        ("cross_golems_monorepo", "golems monorepo architecture golem-powers CLI", ["golems", "golem", "monorepo"], 3, None, None),
+        # Decision retrieval
+        ("decision_voicelayer_rule", "voicelayer local CLI voice tools architecture rule", ["local", "CLI", "VoiceLayer", "voice"], 3, None, None),
+        ("decision_brainlayer_v3", "BrainLayer v3 architecture decisions sqlite-vec", ["BrainLayer", "architecture", "v3", "sqlite"], 3, "brainlayer", None),
+        # Memory retrieval
+        ("memory_whoop", "remember when we discussed WHOOP recovery score", ["whoop", "Whoop", "WHOOP", "recovery"], 3, None, None),
+        ("memory_coach_schedule", "morning schedule wake up huberman protocol", ["schedule", "morning", "huberman", "Huberman", "coach"], 3, None, None),
+        # Mined from logs
+        ("mined_search_quality", "brainlayer search quality evaluation evals measurement mandate", ["measurement", "evals", "baseline", "quality"], 3, None, None),
+        ("mined_enrichment_backend", "enrichment MLX Groq backend progress stats chunks", ["enrichment", "MLX", "Groq", "backend"], 3, "brainlayer", None),
+        ("mined_6pm_decisions", "6pm scheduling architecture confirmed decisions", ["6pm", "6PM", "scheduling", "architecture"], 3, None, None),
+        ("mined_cursor_cli", "cursor CLI agent versus cursor IDE difference", ["cursor", "Cursor", "CLI", "IDE"], 3, None, None),
+        # Known gaps (expected to fail at baseline)
+        ("gap_auth_cross_project", "authentication JWT tokens security implementation", ["auth", "JWT", "token"], 3, None, None),
+        ("gap_hebrew_semantic", "לוח זמנים בוקר קוד", ["לוח זמנים", "schedule", "morning"], 3, None, None),
+    ]
+
+    results = {
+        "run_date": __import__("datetime").date.today().isoformat(),
+        "db": str(db),
+        "cases": [],
+        "pass_count": 0,
+        "total": len(eval_cases),
+        "score_pct": 0.0,
+    }
+
+    for name, query, expected_snippets, top_n, project, tag in eval_cases:
+        try:
+            ids, docs = _search(store, model, query, n=top_n + 2, project=project, tag=tag)
+            passed = _passes(docs, expected_snippets, top_n=top_n)
+            # Find actual rank of first matching doc
+            actual_rank = None
+            for i, doc in enumerate(docs):
+                if any(s.lower() in doc.lower() for s in expected_snippets):
+                    actual_rank = i + 1
+                    break
+            top_snippet = docs[0][:100] if docs else ""
+            top_chunk_id = ids[0] if ids else ""
+        except Exception as e:
+            passed = False
+            actual_rank = None
+            top_snippet = f"ERROR: {e}"
+            top_chunk_id = ""
+
+        case_result = {
+            "name": name,
+            "query": query,
+            "expected_snippets": expected_snippets[:3],
+            "top_n": top_n,
+            "project": project,
+            "tag": tag,
+            "passed": passed,
+            "actual_rank": actual_rank,
+            "top_chunk_id": top_chunk_id[:50],
+            "top_snippet": top_snippet,
+        }
+        results["cases"].append(case_result)
+        if passed:
+            results["pass_count"] += 1
+
+    results["score_pct"] = round(results["pass_count"] / results["total"] * 100, 1)
+    store.close()
+    return results
+
+
+# ── Prompt Hook Entity Injection Tests ──────────────────────────────────────
+
+
+@pytest.mark.live
+class TestPromptHookEntityInjection:
+    """The UserPromptSubmit hook should detect entity names and inject entity context.
+
+    Baseline: FAIL (hook only does FTS5 keyword search — no entity detection).
+    After Phase A: PASS (hook detects entities → injects KG profile + linked chunks).
+    """
+
+    HOOK_PATH = Path.home() / ".claude" / "hooks" / "brainlayer-prompt-search.py"
+
+    def _call_hook(self, prompt: str) -> str:
+        """Run the hook subprocess and return its stdout."""
+        import subprocess
+
+        if not self.HOOK_PATH.exists():
+            pytest.skip(f"Hook not found at {self.HOOK_PATH}")
+
+        result = subprocess.run(
+            ["python3", str(self.HOOK_PATH)],
+            input=json.dumps({"prompt": prompt}),
+            capture_output=True,
+            text=True,
+            timeout=5,
+        )
+        return result.stdout
+
+    def test_entity_detected_avi_simon(self):
+        """Hook should detect 'Avi Simon' as a known entity and inject entity label."""
+        output = self._call_hook("What are Avi Simon's meeting preferences?")
+        # After Phase A: output should mention entity type or entity header
+        assert "[entity:" in output.lower() or "entity: avi simon" in output.lower() or (
+            "avi simon" in output.lower() and "person" in output.lower()
+        ), (
+            f"Expected entity injection for 'Avi Simon' in hook output. "
+            f"This is the Phase A gap (baseline = FAIL).\n"
+            f"Actual output: {output[:300]!r}"
+        )
+
+    def test_entity_detected_fedor(self):
+        """Hook should detect 'Fedor' as a known entity and inject entity label."""
+        output = self._call_hook("What is Fedor working on with GitHub access?")
+        assert "[entity:" in output.lower() or "entity: fedor" in output.lower() or (
+            "fedor" in output.lower() and "person" in output.lower()
+        ), (
+            f"Expected entity injection for 'Fedor' in hook output. "
+            f"Phase A gap (baseline = FAIL).\n"
+            f"Actual output: {output[:300]!r}"
+        )
+
+    def test_hook_injects_entity_before_fts_results(self):
+        """When entity detected, entity section should appear first in hook output."""
+        output = self._call_hook("Tell me about Avi Simon and his 6PM project")
+        lines = [l for l in output.strip().split("\n") if l.strip()]
+        if not lines:
+            pytest.skip("Hook returned no output")
+        # Entity section should come before FTS results
+        first_line = lines[0].lower()
+        assert "[entity" in first_line or "entity:" in first_line, (
+            f"First line should be entity header, got: {first_line!r}"
+        )
+
+    def test_no_entity_in_generic_query(self):
+        """Hook should NOT inject entity section for generic queries."""
+        output = self._call_hook("How does authentication work in Python?")
+        # Generic query — no entity injection expected
+        assert "[entity:" not in output.lower(), (
+            f"Generic query should not trigger entity injection. Got: {output[:200]!r}"
+        )
+
+
+def run_hook_baseline() -> dict:
+    """Run the prompt-hook entity-injection eval cases and return scored results.
+
+    Called by scripts/run_evals.py as part of full baseline; a timed-out hook run fails its case.
+    """
+    import subprocess
+
+    hook_path = Path.home() / ".claude" / "hooks" / "brainlayer-prompt-search.py"
+
+    def call_hook(prompt: str) -> str:
+        """Return the hook's stdout for *prompt* ("" when the hook file is absent)."""
+        if not hook_path.exists():
+            return ""
+        r = subprocess.run(
+            ["python3", str(hook_path)],
+            input=json.dumps({"prompt": prompt}),
+            capture_output=True, text=True, timeout=5,
+        )
+        return r.stdout
+
+    # (name, prompt, expected, first_line_only); expected=None means "[entity:" must be absent.
+    hook_cases = [
+        ("hook_entity_avi_simon", "What are Avi Simon's meeting preferences?",
+         ["[entity:", "entity: avi simon", "person"], False),
+        ("hook_entity_fedor", "What is Fedor working on with GitHub access?",
+         ["[entity:", "entity: fedor", "person"], False),
+        ("hook_entity_first_line", "Tell me about Avi Simon and his 6PM project",
+         ["[entity"], True),  # marker must be on the first non-blank line
+        ("hook_no_entity_generic", "How does authentication work in Python?",
+         None, False),
+    ]
+
+    cases = []
+    pass_count = 0
+    for name, prompt, expected, first_line_only in hook_cases:
+        try:
+            output, timed_out = call_hook(prompt), False
+        except subprocess.TimeoutExpired:
+            output, timed_out = "", True
+        haystack = output.lower()
+        if first_line_only:
+            haystack = next((ln for ln in haystack.strip().split("\n") if ln.strip()), "")
+        if expected is None:
+            hit = "[entity:" not in haystack  # negative case: header must be absent
+        else:
+            hit = any(e.lower() in haystack for e in expected)
+        passed = hit and not timed_out
+        cases.append({"name": name, "query": prompt, "expected": expected,
+                      "passed": passed, "output_preview": output[:150]})
+        if passed:
+            pass_count += 1
+
+    return {
+        "section": "hook_entity_injection",
+        "pass_count": pass_count,
+        "total": len(hook_cases),
+        "score_pct": round(pass_count / len(hook_cases) * 100, 1),
+        "cases": cases,
+    }
+
+
+if __name__ == "__main__":
+    import json
+
+    baseline = run_baseline()
+    hook_baseline = run_hook_baseline()
+
+    print(f"\n=== brain_search quality ===")
+    print(f"Baseline Score: {baseline['pass_count']}/{baseline['total']} ({baseline['score_pct']}%)")
+    print(f"Run date: {baseline['run_date']}")
+    for case in baseline["cases"]:
+        status = "✓ PASS" if case["passed"] else "✗ FAIL"
+        rank = f"rank={case['actual_rank']}" if case["actual_rank"] else "not found"
+        print(f"  {status} [{case['name']}] {rank}")
+        if not case["passed"]:
+            print(f"         top: {case['top_snippet'][:70]!r}")
+
+    print(f"\n=== hook entity injection ===")
+    print(f"Baseline Score: {hook_baseline['pass_count']}/{hook_baseline['total']} ({hook_baseline['score_pct']}%)")
+    for case in hook_baseline["cases"]:
+        status = "✓ PASS" if case["passed"] else "✗ FAIL"
+        print(f"  {status} [{case['name']}]")
+        if not case["passed"]:
+            print(f"         output: {case['output_preview'][:80]!r}")
+
+    combined = {
+        "search": baseline,
+        "hook": hook_baseline,
+        "combined_score_pct": round(
+            (baseline["pass_count"] + hook_baseline["pass_count"])
+            / (baseline["total"] + hook_baseline["total"]) * 100, 1
+        ),
+    }
+    print(f"\nCombined: {combined['combined_score_pct']}%")
+    output_path = Path(__file__).parent / "eval_baselines.json"
+    output_path.write_text(json.dumps(combined, indent=2))
+    print(f"Saved to {output_path}")

From 0d5cc75ecfadcb36167d3de62ace11c48991a647 Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Mon, 9 Mar 2026 13:24:19 +0200
Subject: [PATCH 2/3] fix: resolve ruff lint errors (unused import, f-string
 placeholders)

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_eval_baselines.py | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py
index 31f16b17..d9e0b933 100644
--- a/tests/test_eval_baselines.py
+++ b/tests/test_eval_baselines.py
@@ -423,8 +423,8 @@ def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model):
         # Should NOT return Albania trip planning
         albania_in_results = any("אלבניה" in doc for doc in docs[:3])
         assert not albania_in_results, (
-            f"Hebrew query returned Albania trip content instead of schedule. "
-            f"This is the gap to fix."
+            "Hebrew query returned Albania trip content instead of schedule. "
+            "This is the gap to fix."
         )
 
 
@@ -436,7 +436,6 @@ def run_baseline() -> dict:
 
     Called by scripts/run_evals.py to generate eval_baselines.json.
     """
-    import os
     import sys
 
     src = Path(__file__).parent.parent / "src"
@@ -671,7 +670,7 @@ def call_hook(prompt: str) -> str:
     baseline = run_baseline()
     hook_baseline = run_hook_baseline()
 
-    print(f"\n=== brain_search quality ===")
+    print("\n=== brain_search quality ===")
     print(f"Baseline Score: {baseline['pass_count']}/{baseline['total']} ({baseline['score_pct']}%)")
     print(f"Run date: {baseline['run_date']}")
     for case in baseline["cases"]:
@@ -681,7 +680,7 @@ def call_hook(prompt: str) -> str:
         if not case["passed"]:
             print(f"         top: {case['top_snippet'][:70]!r}")
 
-    print(f"\n=== hook entity injection ===")
+    print("\n=== hook entity injection ===")
     print(f"Baseline Score: {hook_baseline['pass_count']}/{hook_baseline['total']} ({hook_baseline['score_pct']}%)")
     for case in hook_baseline["cases"]:
         status = "✓ PASS" if case["passed"] else "✗ FAIL"

From 8823418d0cade64fa141399df202a5f873f88e85 Mon Sep 17 00:00:00 2001
From: Etan Joseph Heyman <etan@heyman.net>
Date: Mon, 9 Mar 2026 13:25:57 +0200
Subject: [PATCH 3/3] style: format test_eval_baselines.py with ruff

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
---
 tests/test_eval_baselines.py | 188 ++++++++++++++++++++++++++---------
 1 file changed, 143 insertions(+), 45 deletions(-)

diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py
index d9e0b933..91f29052 100644
--- a/tests/test_eval_baselines.py
+++ b/tests/test_eval_baselines.py
@@ -189,8 +189,7 @@ def test_today_returns_recent_chunks(self, live_store, live_model):
                     recent_found = True
                     break
         assert recent_found, (
-            f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. "
-            f"Got chunk_ids: {ids[:5]}"
+            f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. Got chunk_ids: {ids[:5]}"
         )
 
     def test_recent_milestone_findable(self, live_store, live_model):
@@ -219,8 +218,7 @@ def test_hebrew_query_returns_hebrew_content(self, live_store, live_model):
         # Check that results contain Hebrew characters
         hebrew_found = any(any("\u0590" <= c <= "\u05ea" for c in doc) for doc in docs[:3])
         assert hebrew_found, (
-            f"Expected Hebrew content in top 3 results for Hebrew query. "
-            f"Got: {[d[:60] for d in docs[:3]]}"
+            f"Expected Hebrew content in top 3 results for Hebrew query. Got: {[d[:60] for d in docs[:3]]}"
         )
 
     def test_hebrew_style_correction_findable(self, live_store, live_model):
@@ -288,9 +286,7 @@ def test_voicelayer_architectural_rule_findable(self, live_store, live_model):
             docs,
             ["local", "CLI", "architecture", "VoiceLayer", "voicelayer", "voice"],
             top_n=3,
-        ), (
-            f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}"
-        )
+        ), f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}"
 
     def test_brainlayer_decision_findable(self, live_store, live_model):
         """BrainLayer v3 major architecture decisions should be findable."""
@@ -414,7 +410,7 @@ def test_authentication_patterns_cross_project(self, live_store, live_model):
         # Expect actual auth implementation, not brainlayer internals
         assert _passes(docs, ["auth", "JWT", "token", "security"], top_n=3) and not _passes(
             docs, ["hybrid_search", "vector_store", "VectorStore"], top_n=2
-        ), (f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}")
+        ), f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}"
 
     @pytest.mark.xfail(reason="Hebrew semantic: returns unrelated Hebrew content (Albania trip)")
     def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model):
@@ -423,8 +419,7 @@ def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model):
         # Should NOT return Albania trip planning
         albania_in_results = any("אלבניה" in doc for doc in docs[:3])
         assert not albania_in_results, (
-            "Hebrew query returned Albania trip content instead of schedule. "
-            "This is the gap to fix."
+            "Hebrew query returned Albania trip content instead of schedule. This is the gap to fix."
         )
 
 
@@ -460,25 +455,116 @@ def run_baseline() -> dict:
         ("tag_decision", "important decision", ["decision", "DECISION", "chose", "decided"], 3, None, "decision"),
         ("tag_voicelayer_scoped", "architecture decision voicelayer", ["VoiceLayer", "voice"], 3, "voicelayer", None),
         # Recency
-        ("recency_milestone", "measurement mandate evals before improvements", ["measurement", "evals", "baseline"], 3, None, None),
+        (
+            "recency_milestone",
+            "measurement mandate evals before improvements",
+            ["measurement", "evals", "baseline"],
+            3,
+            None,
+            None,
+        ),
         # Hebrew FTS
-        ("hebrew_style_correction", "em dashes Hebrew writing style correction freelance", ["hebrew", "Hebrew", "em dash", "style"], 3, None, None),
+        (
+            "hebrew_style_correction",
+            "em dashes Hebrew writing style correction freelance",
+            ["hebrew", "Hebrew", "em dash", "style"],
+            3,
+            None,
+            None,
+        ),
         # Cross-project
-        ("cross_fts5_architecture", "FTS5 search quality gaps summary tags indexed", ["FTS5", "fts5", "summary", "gaps"], 3, "brainlayer", None),
-        ("cross_golems_monorepo", "golems monorepo architecture golem-powers CLI", ["golems", "golem", "monorepo"], 3, None, None),
+        (
+            "cross_fts5_architecture",
+            "FTS5 search quality gaps summary tags indexed",
+            ["FTS5", "fts5", "summary", "gaps"],
+            3,
+            "brainlayer",
+            None,
+        ),
+        (
+            "cross_golems_monorepo",
+            "golems monorepo architecture golem-powers CLI",
+            ["golems", "golem", "monorepo"],
+            3,
+            None,
+            None,
+        ),
         # Decision retrieval
-        ("decision_voicelayer_rule", "voicelayer local CLI voice tools architecture rule", ["local", "CLI", "VoiceLayer", "voice"], 3, None, None),
-        ("decision_brainlayer_v3", "BrainLayer v3 architecture decisions sqlite-vec", ["BrainLayer", "architecture", "v3", "sqlite"], 3, "brainlayer", None),
+        (
+            "decision_voicelayer_rule",
+            "voicelayer local CLI voice tools architecture rule",
+            ["local", "CLI", "VoiceLayer", "voice"],
+            3,
+            None,
+            None,
+        ),
+        (
+            "decision_brainlayer_v3",
+            "BrainLayer v3 architecture decisions sqlite-vec",
+            ["BrainLayer", "architecture", "v3", "sqlite"],
+            3,
+            "brainlayer",
+            None,
+        ),
         # Memory retrieval
-        ("memory_whoop", "remember when we discussed WHOOP recovery score", ["whoop", "Whoop", "WHOOP", "recovery"], 3, None, None),
-        ("memory_coach_schedule", "morning schedule wake up huberman protocol", ["schedule", "morning", "huberman", "Huberman", "coach"], 3, None, None),
+        (
+            "memory_whoop",
+            "remember when we discussed WHOOP recovery score",
+            ["whoop", "Whoop", "WHOOP", "recovery"],
+            3,
+            None,
+            None,
+        ),
+        (
+            "memory_coach_schedule",
+            "morning schedule wake up huberman protocol",
+            ["schedule", "morning", "huberman", "Huberman", "coach"],
+            3,
+            None,
+            None,
+        ),
         # Mined from logs
-        ("mined_search_quality", "brainlayer search quality evaluation evals measurement mandate", ["measurement", "evals", "baseline", "quality"], 3, None, None),
-        ("mined_enrichment_backend", "enrichment MLX Groq backend progress stats chunks", ["enrichment", "MLX", "Groq", "backend"], 3, "brainlayer", None),
-        ("mined_6pm_decisions", "6pm scheduling architecture confirmed decisions", ["6pm", "6PM", "scheduling", "architecture"], 3, None, None),
-        ("mined_cursor_cli", "cursor CLI agent versus cursor IDE difference", ["cursor", "Cursor", "CLI", "IDE"], 3, None, None),
+        (
+            "mined_search_quality",
+            "brainlayer search quality evaluation evals measurement mandate",
+            ["measurement", "evals", "baseline", "quality"],
+            3,
+            None,
+            None,
+        ),
+        (
+            "mined_enrichment_backend",
+            "enrichment MLX Groq backend progress stats chunks",
+            ["enrichment", "MLX", "Groq", "backend"],
+            3,
+            "brainlayer",
+            None,
+        ),
+        (
+            "mined_6pm_decisions",
+            "6pm scheduling architecture confirmed decisions",
+            ["6pm", "6PM", "scheduling", "architecture"],
+            3,
+            None,
+            None,
+        ),
+        (
+            "mined_cursor_cli",
+            "cursor CLI agent versus cursor IDE difference",
+            ["cursor", "Cursor", "CLI", "IDE"],
+            3,
+            None,
+            None,
+        ),
         # Known gaps (expected to fail at baseline)
-        ("gap_auth_cross_project", "authentication JWT tokens security implementation", ["auth", "JWT", "token"], 3, None, None),
+        (
+            "gap_auth_cross_project",
+            "authentication JWT tokens security implementation",
+            ["auth", "JWT", "token"],
+            3,
+            None,
+            None,
+        ),
         ("gap_hebrew_semantic", "לוח זמנים בוקר קוד", ["לוח זמנים", "schedule", "morning"], 3, None, None),
     ]
 
@@ -563,8 +649,10 @@ def test_entity_detected_avi_simon(self):
         """Hook should detect 'Avi Simon' as a known entity and inject entity label."""
         output = self._call_hook("What are Avi Simon's meeting preferences?")
         # After Phase A: output should mention entity type or entity header
-        assert "[entity:" in output.lower() or "entity: avi simon" in output.lower() or (
-            "avi simon" in output.lower() and "person" in output.lower()
+        assert (
+            "[entity:" in output.lower()
+            or "entity: avi simon" in output.lower()
+            or ("avi simon" in output.lower() and "person" in output.lower())
         ), (
             f"Expected entity injection for 'Avi Simon' in hook output. "
             f"This is the Phase A gap (baseline = FAIL).\n"
@@ -574,8 +662,10 @@ def test_entity_detected_avi_simon(self):
     def test_entity_detected_fedor(self):
         """Hook should detect 'Fedor' as a known entity and inject entity label."""
         output = self._call_hook("What is Fedor working on with GitHub access?")
-        assert "[entity:" in output.lower() or "entity: fedor" in output.lower() or (
-            "fedor" in output.lower() and "person" in output.lower()
+        assert (
+            "[entity:" in output.lower()
+            or "entity: fedor" in output.lower()
+            or ("fedor" in output.lower() and "person" in output.lower())
         ), (
             f"Expected entity injection for 'Fedor' in hook output. "
             f"Phase A gap (baseline = FAIL).\n"
@@ -618,19 +708,25 @@ def call_hook(prompt: str) -> str:
         r = subprocess.run(
             ["python3", str(hook_path)],
             input=json.dumps({"prompt": prompt}),
-            capture_output=True, text=True, timeout=5,
+            capture_output=True,
+            text=True,
+            timeout=5,
         )
         return r.stdout
 
     hook_cases = [
-        ("hook_entity_avi_simon", "What are Avi Simon's meeting preferences?",
-         ["[entity:", "entity: avi simon", "person"]),
-        ("hook_entity_fedor", "What is Fedor working on with GitHub access?",
-         ["[entity:", "entity: fedor", "person"]),
-        ("hook_entity_first_line", "Tell me about Avi Simon and his 6PM project",
-         ["[entity"]),  # entity header must be first line
-        ("hook_no_entity_generic", "How does authentication work in Python?",
-         None),  # None = expect entity NOT present
+        (
+            "hook_entity_avi_simon",
+            "What are Avi Simon's meeting preferences?",
+            ["[entity:", "entity: avi simon", "person"],
+        ),
+        ("hook_entity_fedor", "What is Fedor working on with GitHub access?", ["[entity:", "entity: fedor", "person"]),
+        (
+            "hook_entity_first_line",
+            "Tell me about Avi Simon and his 6PM project",
+            ["[entity"],
+        ),  # entity header must be first line
+        ("hook_no_entity_generic", "How does authentication work in Python?", None),  # None = expect entity NOT present
     ]
 
     cases = []
@@ -645,13 +741,15 @@ def call_hook(prompt: str) -> str:
         else:
             passed = any(e.lower() in output_lower for e in expected)
 
-        cases.append({
-            "name": name,
-            "query": prompt,
-            "expected": expected,
-            "passed": passed,
-            "output_preview": output[:150],
-        })
+        cases.append(
+            {
+                "name": name,
+                "query": prompt,
+                "expected": expected,
+                "passed": passed,
+                "output_preview": output[:150],
+            }
+        )
         if passed:
             pass_count += 1
 
@@ -692,8 +790,8 @@ def call_hook(prompt: str) -> str:
         "search": baseline,
         "hook": hook_baseline,
         "combined_score_pct": round(
-            (baseline["pass_count"] + hook_baseline["pass_count"])
-            / (baseline["total"] + hook_baseline["total"]) * 100, 1
+            (baseline["pass_count"] + hook_baseline["pass_count"]) / (baseline["total"] + hook_baseline["total"]) * 100,
+            1,
         ),
     }
     print(f"\nCombined: {combined['combined_score_pct']}%")