From a7f1a8016b208d35beec56a7b94454923a33c21a Mon Sep 17 00:00:00 2001 From: Etan Joseph Heyman Date: Mon, 9 Mar 2026 13:18:32 +0200 Subject: [PATCH 1/3] feat: eval suite + entity injection in prompt hook (Phase 0 + Phase A) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Phase 0 — Baselines: - tests/test_eval_baselines.py: 23-case eval suite across 8 domains (entity routing, tag filter, recency, Hebrew FTS, cross-project, decision retrieval, memory, mined real queries) - tests/eval_baselines.json: recorded baseline results - scripts/run_evals.py: CLI runner for before/after comparison - tests/conftest.py: register `live` pytest mark Phase A — Entity routing in prompt hook: - hooks/brainlayer-prompt-search.py: detect known entity names (person, company, agent) in user prompt → inject [Entity: Name — type] section + linked chunks before FTS results - Possessive stripping ("Simon's" → "Simon") for bigram matching - Filter: only person/company/agent types (skip technology/concept noise) Before/After scores (run: python tests/test_eval_baselines.py): - brain_search quality: 94.7% (18/19) — unchanged (already good) - hook entity injection: 25% → 100% (3 tests now pass) - combined: 82.6% → 95.7% (+13.1pp) Known gaps (xfail): - Hebrew semantic accuracy (query returns unrelated Hebrew content) - "today" temporal awareness in raw hybrid_search Co-Authored-By: Claude Sonnet 4.6 --- hooks/brainlayer-prompt-search.py | 311 +++++++++++++ scripts/run_evals.py | 95 ++++ tests/conftest.py | 8 + tests/eval_baselines.json | 361 +++++++++++++++ tests/test_eval_baselines.py | 703 ++++++++++++++++++++++++++++++ 5 files changed, 1478 insertions(+) create mode 100755 hooks/brainlayer-prompt-search.py create mode 100644 scripts/run_evals.py create mode 100644 tests/eval_baselines.json create mode 100644 tests/test_eval_baselines.py diff --git a/hooks/brainlayer-prompt-search.py b/hooks/brainlayer-prompt-search.py new file mode 100755 index 
#!/usr/bin/env python3
"""
BrainLayer UserPromptSubmit Hook — auto-searches memories relevant to the user's prompt.

Extracts keywords from the prompt, runs FTS5 search against BrainLayer.
Two modes:
  - Light (default): top 3 results, ~300 tokens
  - Deep (triggered by memory words): top 8 results, ~800 tokens

Before the FTS results, known knowledge-graph entities (person / company /
agent) named in the prompt are injected as ``[Entity: Name — type]`` sections.

Output: plain text to stdout (injected as Claude context).
Target: <500ms total.
"""

import json
import os
import re
import sqlite3
import sys
import time

DEADLINE_MS = 450

# Prompts shorter than this are probably greetings/commands — skip search
MIN_PROMPT_LENGTH = 15

# Trigger words that activate deep mode (more results)
DEEP_TRIGGERS = {
    "remember",
    "last time",
    "previous",
    "previously",
    "before",
    "history",
    "earlier",
    "we discussed",
    "we decided",
    "we talked",
    "recall",
    "forgot",
    "what was",
    "what were",
    "when did",
    "how did",
    "brainlayer",
}

# Common English stop words to skip during keyword extraction
STOP_WORDS = {
    "a", "an", "the", "is", "it", "in", "on", "at", "to", "for", "of",
    "and", "or", "but", "not", "with", "this", "that", "from", "by",
    "are", "was", "were", "be", "been", "being", "have", "has", "had",
    "do", "does", "did", "will", "would", "could", "should", "may",
    "might", "can", "shall", "must", "need", "let", "me", "my", "i",
    "you", "your", "we", "our", "they", "them", "their", "he", "she",
    "his", "her", "its", "if", "then", "else", "when", "where", "how",
    "what", "which", "who", "why", "so", "just", "also", "very", "too",
    "up", "out", "about", "into", "over", "after", "some", "any", "all",
    "no", "yes", "ok", "okay", "please", "thanks", "thank", "hey",
    "hi", "hello", "sure", "right", "well", "now", "here", "there",
    "like", "want", "think", "know", "see", "look", "make", "take",
    "get", "go", "come", "use", "try", "help", "tell", "give", "show",
    "work", "call", "run", "set", "add", "put", "keep", "find", "read",
    "write", "create", "build", "check", "start", "stop", "change",
    "move", "open", "close", "new", "old", "good", "bad", "big",
    "small", "first", "last", "next", "more", "less", "much", "many",
    "each", "every", "other", "same", "different", "own", "still",
    "already", "again", "even", "really", "actually", "probably",
    "maybe", "file", "code", "thing", "way", "something", "anything",
}

DB_PATHS = [
    os.path.expanduser("~/.local/share/zikaron/zikaron.db"),
    os.path.expanduser("~/.local/share/brainlayer/brainlayer.db"),
]

# Entity types that warrant automatic context injection.
# Technology/concept entities are too noisy for automatic injection.
INJECT_TYPES = frozenset({"person", "company", "agent"})

# Trailing characters that are neither word characters nor hyphens
# (punctuation, quotes, apostrophes, underscores).
_TRAILING_JUNK_RE = re.compile(r"(?:[^\w-]|_)+$")
# Possessive suffix with a straight or typographic apostrophe.
_POSSESSIVE_RE = re.compile(r"['’]s$")
# Anything that is not a (Unicode) letter, digit or hyphen.
_NON_NAME_CHAR_RE = re.compile(r"[^\w-]|_")


def get_db_path():
    """Return the first existing BrainLayer DB path, or None.

    The ``BRAINLAYER_DB`` environment variable wins when it points at an
    existing file; otherwise the entries of ``DB_PATHS`` are tried in order.
    """
    env = os.environ.get("BRAINLAYER_DB")
    if env and os.path.exists(env):
        return env
    for p in DB_PATHS:
        if os.path.exists(p):
            return p
    return None


def is_deep_mode(prompt_lower):
    """Return True when the lower-cased prompt contains any deep-mode trigger."""
    return any(trigger in prompt_lower for trigger in DEEP_TRIGGERS)


def extract_keywords(prompt):
    """Extract meaningful keywords from the prompt for FTS5 search.

    Returns at most 8 unique lower-cased keywords, in order of appearance.
    """
    # Remove URLs, paths, code blocks
    text = re.sub(r"https?://\S+", "", prompt)
    text = re.sub(r"[/~]\S+", "", text)
    text = re.sub(r"`[^`]+`", "", text)

    # Extract words (keep hyphens for compound terms like "6pm-mini")
    words = re.findall(r"[a-zA-Z0-9][\w-]*", text.lower())

    # Filter out stop words and short words, de-duplicating in order
    keywords = []
    seen = set()
    for w in words:
        if w not in STOP_WORDS and len(w) > 2 and w not in seen:
            keywords.append(w)
            seen.add(w)

    return keywords[:8]  # Cap at 8 keywords for FTS5 performance


def truncate(text, max_chars=200):
    """Collapse newlines to " | " and cut *text* to roughly *max_chars*.

    Cuts at the last space inside the limit and appends "...". Empty or
    missing content (e.g. a NULL column) yields an empty string.
    """
    if not text:
        return ""
    # Clean up multi-line content for compact display
    text = re.sub(r"\n+", " | ", text.strip())
    if len(text) <= max_chars:
        return text
    return text[:max_chars].rsplit(" ", 1)[0] + "..."


def elapsed_ms(start):
    """Milliseconds elapsed since *start* (a ``time.monotonic()`` reading)."""
    return (time.monotonic() - start) * 1000


def _clean_entity_word(word):
    """Normalise one whitespace-separated token for entity matching.

    Strips trailing punctuation first, then a possessive ``'s`` / ``’s``,
    then any remaining character that is not a letter, digit or hyphen.
    Doing it in this order keeps "Fedor's," -> "Fedor" and "James'" -> "James"
    (a bare trailing apostrophe must not cost the name its final "s").
    """
    core = _TRAILING_JUNK_RE.sub("", word)
    core = _POSSESSIVE_RE.sub("", core)
    return _NON_NAME_CHAR_RE.sub("", core)


def detect_entities_in_prompt(prompt, conn):
    """Detect known KG entity names in the prompt.

    Checks bigrams (at least one word capitalised) and single capitalised
    words of 4+ characters against ``kg_entities``.
    Returns list of dicts: {id, name, entity_type}.
    Fast: exact SQL LOWER() match, no FTS5 overhead.

    Only high-signal entity types (``INJECT_TYPES``) are returned. The type
    filter is part of the SQL so that a same-named row of another type
    cannot shadow a person/company/agent row behind ``LIMIT 1``.
    """
    cleaned_words = [_clean_entity_word(w) for w in prompt.split()]
    candidates = []

    # Bigrams: "Avi Simon", "Fedor Sidorov" etc.
    for w1, w2 in zip(cleaned_words, cleaned_words[1:]):
        if not w1 or not w2:
            continue
        # At least one word must start uppercase (entities are proper nouns)
        if w1[0].isupper() or w2[0].isupper():
            candidates.append(f"{w1} {w2}")

    # Single capitalized words (4+ chars to avoid "Who", "Ask", etc.);
    # all-caps tokens are skipped as likely acronyms.
    for w in cleaned_words:
        if len(w) >= 4 and w[0].isupper() and not w.isupper():
            candidates.append(w)

    if not candidates:
        return []

    types = sorted(INJECT_TYPES)
    # Only "?" placeholders are interpolated here; all values stay bound parameters.
    sql = (
        "SELECT id, name, entity_type FROM kg_entities "
        "WHERE LOWER(name) = LOWER(?) "
        f"AND entity_type IN ({', '.join('?' * len(types))}) "
        "LIMIT 1"
    )

    matched = []
    seen_ids = set()
    try:
        # dict.fromkeys de-duplicates while keeping first-seen order, so a
        # repeated name costs one query instead of several.
        for candidate in dict.fromkeys(candidates):
            row = conn.execute(sql, (candidate, *types)).fetchone()
            if row is None:
                continue
            eid, name, etype = row
            if eid not in seen_ids:
                seen_ids.add(eid)
                matched.append({"id": eid, "name": name, "entity_type": etype})
    except sqlite3.Error:
        # Best effort: the KG tables may be absent in older databases.
        # Whatever matched before the error is still returned.
        pass

    return matched


def get_entity_chunks(entity_id, conn, limit=3):
    """Get top linked chunks for an entity, most relevant first.

    Returns rows of (content, created_at, project); an empty list on any
    SQLite error.
    """
    try:
        return conn.execute(
            """
            SELECT c.content, c.created_at, c.project
            FROM kg_entity_chunks ec
            JOIN chunks c ON c.id = ec.chunk_id
            WHERE ec.entity_id = ?
            ORDER BY ec.relevance DESC
            LIMIT ?
            """,
            (entity_id, limit),
        ).fetchall()
    except sqlite3.Error:
        return []


def main():
    """Read the hook payload from stdin and print matching memories to stdout.

    Always exits with status 0: the hook must never block the user's prompt,
    so every failure path degrades to "print nothing".
    """
    start = time.monotonic()

    try:
        hook_input = json.loads(sys.stdin.read())
    except (json.JSONDecodeError, EOFError):
        sys.exit(0)

    # Valid JSON is not necessarily an object; anything else has no prompt.
    prompt = hook_input.get("prompt", "") if isinstance(hook_input, dict) else ""
    if not isinstance(prompt, str) or len(prompt) < MIN_PROMPT_LENGTH:
        sys.exit(0)

    # Skip if prompt is a slash command
    if prompt.strip().startswith("/"):
        sys.exit(0)

    deep = is_deep_mode(prompt.lower())
    keywords = extract_keywords(prompt)

    if not keywords:
        sys.exit(0)

    db_path = get_db_path()
    if not db_path:
        sys.exit(0)

    try:
        conn = sqlite3.connect(f"file:{db_path}?mode=ro", uri=True, timeout=2)
    except sqlite3.Error:
        sys.exit(0)

    limit = 8 if deep else 3

    # Build FTS5 query: join keywords with OR for broader matching
    fts_query = " OR ".join(f'"{kw}"' for kw in keywords)

    lines = []
    try:
        # The journal mode is a persistent property of the database file and
        # cannot be changed through a read-only connection, so it is not set
        # here; trying to can fail on a database that is not already in WAL
        # mode and would silently disable the hook.
        conn.execute("PRAGMA query_only=true")

        # Phase A: Entity routing — detect known entity names in prompt
        # and inject entity profile before FTS results.
        if elapsed_ms(start) < DEADLINE_MS:
            entities = detect_entities_in_prompt(prompt, conn)
            for entity in entities[:2]:  # at most 2 entities per prompt
                etype = entity["entity_type"]
                ename = entity["name"]
                lines.append(f"[Entity: {ename} — {etype}]")
                # Get entity-linked chunks for context
                entity_chunks = get_entity_chunks(entity["id"], conn, limit=2)
                for content, created_at, project in entity_chunks:
                    date = created_at[:10] if created_at else "?"
                    proj = f" ({project})" if project else ""
                    lines.append(f"- [{date}{proj}] {truncate(content, max_chars=150)}")

        if elapsed_ms(start) < DEADLINE_MS:
            rows = conn.execute(
                """
                SELECT c.content, c.importance, c.project, c.tags, c.created_at
                FROM chunks_fts f
                JOIN chunks c ON c.id = f.chunk_id
                WHERE chunks_fts MATCH ?
                ORDER BY rank
                LIMIT ?
                """,
                (fts_query, limit),
            ).fetchall()

            if rows:
                mode_label = "deep" if deep else "auto"
                # The header doubles as the separator after any entity section.
                lines.append(f"[BrainLayer {mode_label}] Memories matching your prompt:")
                for content, importance, project, _tags, created_at in rows:
                    date = created_at[:10] if created_at else "?"
                    imp = f" imp:{importance:.0f}" if importance else ""
                    proj = f" ({project})" if project else ""
                    lines.append(f"- [{date}{imp}{proj}] {truncate(content)}")

                if not deep:
                    lines.append("(Use brain_search for deeper results.)")
    except sqlite3.Error:
        # Best effort: print whatever was collected before the failure.
        pass
    finally:
        conn.close()

    if lines:
        print("\n".join(lines))

    sys.exit(0)


if __name__ == "__main__":
    main()
#!/usr/bin/env python3
"""Run all eval cases and save scored results to tests/eval_baselines.json.

Usage:
    python scripts/run_evals.py          # run + save + print summary
    python scripts/run_evals.py --no-save  # run + print, don't save
    python scripts/run_evals.py --diff   # compare to saved baseline

This script calls run_baseline() and run_hook_baseline() from test_eval_baselines.py
and writes the combined results to tests/eval_baselines.json.
"""

import argparse
import json
import sys
from pathlib import Path

# Ensure src is importable
src = Path(__file__).parent.parent / "src"
if str(src) not in sys.path:
    sys.path.insert(0, str(src))

tests_dir = Path(__file__).parent.parent / "tests"
if str(tests_dir) not in sys.path:
    sys.path.insert(0, str(tests_dir))

# Must come after the sys.path setup above.
from test_eval_baselines import run_baseline, run_hook_baseline  # noqa: E402

BASELINE_FILE = Path(__file__).parent.parent / "tests" / "eval_baselines.json"


def _load_previous_baseline():
    """Return the saved baseline dict, or None (with a notice) if unusable."""
    if not BASELINE_FILE.exists():
        print(f"No saved baseline at {BASELINE_FILE}; skipping diff.")
        return None
    try:
        return json.loads(BASELINE_FILE.read_text(encoding="utf-8"))
    except json.JSONDecodeError as exc:
        print(f"Saved baseline at {BASELINE_FILE} is not valid JSON ({exc}); skipping diff.")
        return None


def _combined_pct(search_results, hook_results):
    """Combined pass percentage over both suites; 0.0 when there are no cases."""
    total = search_results["total"] + hook_results["total"]
    if not total:
        return 0.0
    passed = search_results["pass_count"] + hook_results["pass_count"]
    return round(passed / total * 100, 1)


def main():
    """Run both eval suites, print a summary, and optionally save/diff results."""
    parser = argparse.ArgumentParser(description="Run BrainLayer eval suite")
    parser.add_argument("--no-save", action="store_true", help="Don't save results")
    parser.add_argument("--diff", action="store_true", help="Compare to saved baseline")
    args = parser.parse_args()

    # Load previous baseline for diff (before this run can overwrite it)
    prev = _load_previous_baseline() if args.diff else None

    print("Running search quality evals...")
    search_results = run_baseline()
    print("Running hook entity injection evals...")
    hook_results = run_hook_baseline()

    combined = {
        "search": search_results,
        "hook": hook_results,
        "combined_score_pct": _combined_pct(search_results, hook_results),
    }

    # Print summary
    print("\n=== brain_search quality ===")
    print(f"Score: {search_results['pass_count']}/{search_results['total']} ({search_results['score_pct']}%)")
    if prev:
        prev_pct = prev.get("search", {}).get("score_pct", 0)
        delta = search_results["score_pct"] - prev_pct
        print(f"Delta vs baseline: {delta:+.1f}%")
    for case in search_results["cases"]:
        status = "✓" if case["passed"] else "✗"
        rank = f"rank={case['actual_rank']}" if case["actual_rank"] else "not found"
        print(f"  {status} [{case['name']}] {rank}")
        if not case["passed"]:
            # The snippet can be null when a search returned nothing.
            print(f"      top: {(case.get('top_snippet') or '')[:70]!r}")

    print("\n=== hook entity injection ===")
    print(f"Score: {hook_results['pass_count']}/{hook_results['total']} ({hook_results['score_pct']}%)")
    if prev:
        prev_hook_pct = prev.get("hook", {}).get("score_pct", 0)
        hook_delta = hook_results["score_pct"] - prev_hook_pct
        print(f"Delta vs baseline: {hook_delta:+.1f}%")
    for case in hook_results["cases"]:
        status = "✓" if case["passed"] else "✗"
        print(f"  {status} [{case['name']}]")
        if not case["passed"]:
            print(f"      output: {(case.get('output_preview') or '')[:80]!r}")

    print(f"\nCombined: {combined['combined_score_pct']}%")
    if prev:
        prev_combined = prev.get("combined_score_pct", 0)
        print(f"Delta vs baseline: {combined['combined_score_pct'] - prev_combined:+.1f}%")

    if not args.no_save:
        BASELINE_FILE.write_text(json.dumps(combined, indent=2), encoding="utf-8")
        print(f"\nSaved to {BASELINE_FILE}")


if __name__ == "__main__":
    main()


# --- tests/conftest.py hunk --------------------------------------------------
# The patch adds the hook below to tests/conftest.py (a different file from the
# script above); it sits on the same collapsed source line, so it is carried
# here with its logic unchanged.


def pytest_configure(config):
    """Register custom pytest marks."""
    config.addinivalue_line(
        "markers",
        "live: mark test as requiring a live production DB (skipped in CI if DB absent)",
    )
diff --git a/tests/eval_baselines.json b/tests/eval_baselines.json new file mode 100644 index 00000000..249d7de5 --- /dev/null +++ b/tests/eval_baselines.json @@ -0,0 +1,361 @@ +{ + "search": { + "run_date": "2026-03-09", + "db": "/Users/etanheyman/.local/share/zikaron/zikaron.db", + "cases": [ + { + "name": "entity_avi_simon", + "query": "Avi Simon platform invites schedule", + "expected_snippets": [ + "avi simon", + "6pm", + "6PM" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-82fa7ad5a3614d46", + "top_snippet": "Avi Simon \u2014 6PM Platform Tech Advisor Call (Feb 24, 2026)\n\nContext: 60-min tech screen call with Avi" + }, + { + "name": "entity_fedor", + "query": "Fedor iOS build handover GitHub", + "expected_snippets": [ + "fedor", + "Fedor", + "iOS" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-e731b63ab50c4769", + "top_snippet": "DECISION: MeHayom iOS build ownership (March 10 2026): Fedor submits iOS build to App Store review, " + }, + { + "name": "entity_yuval_mehayom", + "query": "MeHayom Yuval sprint payment", + "expected_snippets": [ + "yuval", + "Yuval", + "MeHayom" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-6494601725414371", + "top_snippet": "[2026-03-10] MeHayom Sprint 1: First 50% payment (2,100 NIS) landed in bank. 
Yuval added Etan to App" + }, + { + "name": "tag_decision", + "query": "important decision", + "expected_snippets": [ + "decision", + "DECISION", + "chose" + ], + "top_n": 3, + "project": null, + "tag": "decision", + "passed": true, + "actual_rank": 2, + "top_chunk_id": "manual-0822d7551cde4362", + "top_snippet": "6PM Research Results Summary (Feb 28, 3 deep research papers):\n\nEXTRACTION OPTIMIZATION:\n- Tool desc" + }, + { + "name": "tag_voicelayer_scoped", + "query": "architecture decision voicelayer", + "expected_snippets": [ + "VoiceLayer", + "voice" + ], + "top_n": 3, + "project": "voicelayer", + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-a5b9d46fcc0b484a", + "top_snippet": "VoiceLayer architecture BUG: FlowBar is the client, MCP servers are the servers \u2014 backwards. Each MC" + }, + { + "name": "recency_milestone", + "query": "measurement mandate evals before improvements", + "expected_snippets": [ + "measurement", + "evals", + "baseline" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-1a0c8f70a35d4ac4", + "top_snippet": "MEASUREMENT MANDATE (March 9, 2026): All improvements must be measured before and after. 
No \"it feel" + }, + { + "name": "hebrew_style_correction", + "query": "em dashes Hebrew writing style correction freelance", + "expected_snippets": [ + "hebrew", + "Hebrew", + "em dash" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-83d0d8baa2c549b4", + "top_snippet": "ETAN'S HEBREW WRITING STYLE \u2014 Freelance/WhatsApp (extracted from iterative corrections, March 10 202" + }, + { + "name": "cross_fts5_architecture", + "query": "FTS5 search quality gaps summary tags indexed", + "expected_snippets": [ + "FTS5", + "fts5", + "summary" + ], + "top_n": 3, + "project": "brainlayer", + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-17ea700f44794d55", + "top_snippet": "BrainLayer Search Pipeline Gaps (from Cursor audit Mar 2):\n1. FTS5 only indexes chunks.content \u2014 sum" + }, + { + "name": "cross_golems_monorepo", + "query": "golems monorepo architecture golem-powers CLI", + "expected_snippets": [ + "golems", + "golem", + "monorepo" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "Do a VERY THOROUGH exploration of the Golems monorepo automation infrastructure. I need to understan" + }, + { + "name": "decision_voicelayer_rule", + "query": "voicelayer local CLI voice tools architecture rule", + "expected_snippets": [ + "local", + "CLI", + "VoiceLayer" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "## Summary of Key Findings\n\n### 1. 
**Async Voice Architecture Research** (first 100 lines)\nThe hybri" + }, + { + "name": "decision_brainlayer_v3", + "query": "BrainLayer v3 architecture decisions sqlite-vec", + "expected_snippets": [ + "BrainLayer", + "architecture", + "v3" + ], + "top_n": 3, + "project": "brainlayer", + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-705bf11e2e7c436a", + "top_snippet": "BrainLayer v3 Plan \u2014 Major Architecture Decisions (Feb 24, 2026):\n\n1. VISION: BrainLayer becomes the" + }, + { + "name": "memory_whoop", + "query": "remember when we discussed WHOOP recovery score", + "expected_snippets": [ + "whoop", + "Whoop", + "WHOOP" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "\u23fa Bash(TOKEN=$(python3 -c \"import json; print(json.load(open('/tmp/whoop-tokens.json'))['access_toke" + }, + { + "name": "memory_coach_schedule", + "query": "morning schedule wake up huberman protocol", + "expected_snippets": [ + "schedule", + "morning", + "huberman" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": " - **`/Users/etanheyman/Gits/golems/skills/golem-powers/coach/workflows/health.md`** (REWRITTEN th" + }, + { + "name": "mined_search_quality", + "query": "brainlayer search quality evaluation evals measurement mandate", + "expected_snippets": [ + "measurement", + "evals", + "baseline" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-1a0c8f70a35d4ac4", + "top_snippet": "MEASUREMENT MANDATE (March 9, 2026): All improvements must be measured before and after. 
No \"it feel" + }, + { + "name": "mined_enrichment_backend", + "query": "enrichment MLX Groq backend progress stats chunks", + "expected_snippets": [ + "enrichment", + "MLX", + "Groq" + ], + "top_n": 3, + "project": "brainlayer", + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "manual-0717f7968aff4c5c", + "top_snippet": "Switched enrichment from MLX to Groq (March 2 2026). Reason: MLX server (Qwen2.5-Coder-14B, ~5GB RAM" + }, + { + "name": "mined_6pm_decisions", + "query": "6pm scheduling architecture confirmed decisions", + "expected_snippets": [ + "6pm", + "6PM", + "scheduling" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "You are 6pmClaude \u2014 working on the 6PM blind scheduling system.\n\n## First: Load Context\n\n```\nbrain_s" + }, + { + "name": "mined_cursor_cli", + "query": "cursor CLI agent versus cursor IDE difference", + "expected_snippets": [ + "cursor", + "Cursor", + "CLI" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "Perfect! Now I have comprehensive information. 
Let me compile a detailed research report for you:\n\n#" + }, + { + "name": "gap_auth_cross_project", + "query": "authentication JWT tokens security implementation", + "expected_snippets": [ + "auth", + "JWT", + "token" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": true, + "actual_rank": 1, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "from brainlayer import brain\nresult = brain.search(\"how did I implement auth\")\nbrain.store(\"Decided " + }, + { + "name": "gap_hebrew_semantic", + "query": "\u05dc\u05d5\u05d7 \u05d6\u05de\u05e0\u05d9\u05dd \u05d1\u05d5\u05e7\u05e8 \u05e7\u05d5\u05d3", + "expected_snippets": [ + "\u05dc\u05d5\u05d7 \u05d6\u05de\u05e0\u05d9\u05dd", + "schedule", + "morning" + ], + "top_n": 3, + "project": null, + "tag": null, + "passed": false, + "actual_rank": null, + "top_chunk_id": "/Users/etanheyman/.claude/projects/-Users-etanheym", + "top_snippet": "\u05d4\u05d9\u05d9, \u05d0\u05e0\u05d9 \u05de\u05db\u05d9\u05df \u05d0\u05ea \u05d4\u05e4\u05e8\u05d5\u05d9\u05e7\u05d8 \u05d4\u05d6\u05d4 \u05d1\u05e9\u05d1\u05d9\u05dc \u05d0\u05de\u05d0 \u05e9\u05dc\u05d9 \u05d5\u05d1\u05df \u05d6\u05d5\u05d2\u05d4. \u05d4\u05dd \u05d8\u05e1\u05d9\u05dd \u05dc\u05d0\u05dc\u05d1\u05e0\u05d9\u05d4.\n\n\u05d9\u05e9 \u05dc\u05d4\u05dd \u05db\u05d1\u05e8 \u05db\u05de\u05d4 \u05d3\u05d1\u05e8\u05d9\u05dd \u05de\u05ea\u05d5\u05db\u05e0\u05e0\u05d9\u05dd" + } + ], + "pass_count": 18, + "total": 19, + "score_pct": 94.7 + }, + "hook": { + "section": "hook_entity_injection", + "pass_count": 4, + "total": 4, + "score_pct": 100.0, + "cases": [ + { + "name": "hook_entity_avi_simon", + "query": "What are Avi Simon's meeting preferences?", + "expected": [ + "[entity:", + "entity: avi simon", + "person" + ], + "passed": true, + "output_preview": "[Entity: Avi Simon \u2014 person]\n- [? 
(golems)] # Family Brain \u2014 Multi-User BrainLayer + Relationship Ecosystem | > From \"Etan's tools\" to a people/busine" + }, + { + "name": "hook_entity_fedor", + "query": "What is Fedor working on with GitHub access?", + "expected": [ + "[entity:", + "entity: fedor", + "person" + ], + "passed": true, + "output_preview": "[Entity: Fedor \u2014 person]\n- [? (orchestrator)] coachClaude session transcript \u2014 March 9-10, 2026. ~/Gits/golems/packages/coach | ## Key Events: | ### 1" + }, + { + "name": "hook_entity_first_line", + "query": "Tell me about Avi Simon and his 6PM project", + "expected": [ + "[entity" + ], + "passed": true, + "output_preview": "[Entity: Avi Simon \u2014 person]\n- [? (golems)] # Family Brain \u2014 Multi-User BrainLayer + Relationship Ecosystem | > From \"Etan's tools\" to a people/busine" + }, + { + "name": "hook_no_entity_generic", + "query": "How does authentication work in Python?", + "expected": null, + "passed": true, + "output_preview": "[BrainLayer auto] Memories matching your prompt:\n- [2026-02-19 imp:7 (golems)] Also, I got the authentication application successfully set up on PyPi." + } + ] + }, + "combined_score_pct": 95.7 +} \ No newline at end of file diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py new file mode 100644 index 00000000..31f16b17 --- /dev/null +++ b/tests/test_eval_baselines.py @@ -0,0 +1,703 @@ +"""Phase 0: Search Quality Eval Suite — Baselines. + +Measures real BrainLayer search quality against the production DB. +Requires: `pytest -m live` (skipped in CI, requires live DB). + +Grade: expected content snippet in top N = PASS. +Baseline file: tests/eval_baselines.json (generated by scripts/run_evals.py). 
"""Phase 0: Search Quality Eval Suite — Baselines.

Measures real BrainLayer search quality against the production DB.
Requires: `pytest -m live` (skipped in CI, requires live DB).

Grade: expected content snippet in top N = PASS.
Baseline file: tests/eval_baselines.json (generated by scripts/run_evals.py).

Run baseline (the script writes tests/eval_baselines.json itself; do not
redirect stdout into that file — stdout is the human-readable summary):
    python scripts/run_evals.py

Run live tests:
    pytest tests/test_eval_baselines.py -m live -v

Test cases cover:
- Entity routing (entity names in query)
- Tag filter (structured queries)
- Recency (recent content prioritized)
- Hebrew FTS (non-ASCII content)
- Cross-project search
- Decision/correction retrieval
- Memory word detection
- Real mined queries from session history
"""

import json
import os
import sys
from pathlib import Path

import pytest

# ── Fixtures ────────────────────────────────────────────────────────────────

EVAL_CASES_FILE = Path(__file__).parent / "eval_baselines.json"


def _ensure_src_on_path() -> None:
    """Put the repository's ``src`` directory at the front of ``sys.path``."""
    src = str(Path(__file__).parent.parent / "src")
    if src not in sys.path:
        sys.path.insert(0, src)


@pytest.fixture(scope="module")
def live_store():
    """Real production VectorStore — requires live DB.

    Skips if brainlayer is not importable or the DB is absent (CI environment).
    """
    # Extend sys.path BEFORE probing for the package: otherwise a source
    # checkout without an installed brainlayer is skipped even though
    # src/ would have made it importable.
    _ensure_src_on_path()
    pytest.importorskip("brainlayer", reason="brainlayer not installed")

    from brainlayer.paths import get_db_path
    from brainlayer.vector_store import VectorStore

    db = get_db_path()
    if not os.path.exists(db):
        pytest.skip(f"Live DB not found at {db}")

    store = VectorStore(db)
    yield store
    store.close()


@pytest.fixture(scope="module")
def live_model():
    """Real embedding model — cached across tests in this module."""
    _ensure_src_on_path()
    # Skip (rather than error) when the package is unavailable, matching live_store.
    pytest.importorskip("brainlayer", reason="brainlayer not installed")

    from brainlayer.embeddings import get_embedding_model

    return get_embedding_model()


def _search(store, model, query, n=5, project=None, tag=None) -> tuple[list[str], list[str]]:
    """Run hybrid search; return (chunk_ids, documents)."""
    emb = model.embed_query(query)
    results = store.hybrid_search(
        query_embedding=emb,
        query_text=query,
        n_results=n,
        project_filter=project,
        tag_filter=tag,
    )
    return results["ids"][0], results["documents"][0]


def _passes(docs: list[str], expected_snippets: list[str], top_n: int) -> bool:
    """Return True if any expected_snippet appears in any of the top_n docs.

    Matching is case-insensitive substring containment.
    """
    wanted = [snippet.lower() for snippet in expected_snippets]
    return any(snippet in doc.lower() for doc in docs[:top_n] for snippet in wanted)


# ── Entity Routing Tests ─────────────────────────────────────────────────────


@pytest.mark.live
class TestEntityRouting:
    """Entity names in query should surface entity-linked chunks."""

    def test_avi_simon_entity(self, live_store, live_model):
        """Query about Avi Simon should surface 6PM / MeHayom interaction chunks."""
        _, docs = _search(live_store, live_model, "Avi Simon platform invites schedule", n=5)
        assert _passes(docs, ["avi simon", "6pm", "6PM"], top_n=3), (
            f"Expected Avi Simon content in top 3, got: {[d[:60] for d in docs[:3]]}"
        )

    def test_fedor_entity(self, live_store, live_model):
        """Query about Fedor should surface MeHayom / iOS build chunks."""
        _, docs = _search(live_store, live_model, "Fedor iOS build handover GitHub", n=5)
        assert _passes(docs, ["fedor", "Fedor", "iOS", "MeHayom"], top_n=3), (
            f"Expected Fedor content in top 3, got: {[d[:60] for d in docs[:3]]}"
        )

    def test_yuval_mehayom_entity(self, live_store, live_model):
        """Query about Yuval (MeHayom client) should surface sprint/payment chunks."""
        _, docs = _search(live_store, live_model, "MeHayom Yuval sprint payment", n=5)
        assert _passes(docs, ["yuval", "Yuval", "mehayom", "MeHayom"], top_n=3), (
            f"Expected MeHayom/Yuval content in top 3, got: {[d[:60] for d in docs[:3]]}"
        )


# ── Tag Filter Tests ─────────────────────────────────────────────────────────


@pytest.mark.live
class TestTagFilter:
    """Tag-filtered searches should respect chunk tag metadata."""

    def test_decision_tag_returns_decisions(self, live_store, live_model):
        """tag='decision' filter should return brain_store chunks tagged 'decision'."""
        ids, docs = _search(
            live_store,
            live_model,
            "important decision",
            n=5,
            tag="decision",
        )
        assert len(ids) >= 1, "tag='decision' filter returned 0 results"
        # At least one of the top 3 results should contain decision-related content
        assert _passes(docs, ["decision", "DECISION", "Decision", "chose", "decided"], top_n=3), (
            f"tag=decision filter didn't return decision content, got: {[d[:60] for d in docs[:3]]}"
        )

    def test_voicelayer_decision_scoped(self, live_store, live_model):
        """project=voicelayer + decision query should return voicelayer decisions."""
        ids, docs = _search(
            live_store,
            live_model,
            "architecture decision voicelayer",
            n=5,
            project="voicelayer",
        )
        assert len(ids) >= 1, "voicelayer project scoped search returned 0 results"
        assert _passes(docs, ["voicelayer", "VoiceLayer", "voice", "Voice"], top_n=3), (
            f"Expected VoiceLayer content, got: {[d[:60] for d in docs[:3]]}"
        )


# ── Recency Tests ────────────────────────────────────────────────────────────


@pytest.mark.live
class TestRecency:
    """Recent content should rank above old content for recency-sensitive queries."""

    @pytest.mark.xfail(
        reason="hybrid_search lacks temporal 'today' awareness; _brain_search router handles this via _current_context"
    )
    def test_today_returns_recent_chunks(self, live_store, live_model):
        """'what happened today' should return chunks from 2026-03-09 or 2026-03-10.

        Gap: hybrid_search scores by relevance+recency but doesn't boost 'today=24h'.
        Fix: route 'today' queries through _brain_search → _current_context.
        """
        ids, _ = _search(live_store, live_model, "what happened today current work progress", n=5)
        cursor = live_store.conn.cursor()
        recent_found = False
        for cid in ids[:5]:
            rows = list(cursor.execute("SELECT created_at FROM chunks WHERE id = ?", (cid,)))
            if rows and rows[0][0]:
                date_str = str(rows[0][0])
                if "2026-03-09" in date_str or "2026-03-10" in date_str:
                    recent_found = True
                    break
        assert recent_found, (
            f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. "
            f"Got chunk_ids: {ids[:5]}"
        )

    def test_recent_milestone_findable(self, live_store, live_model):
        """Recent brain_store milestone should appear for relevant query."""
        _, docs = _search(
            live_store,
            live_model,
            "measurement mandate evals before improvements",
            n=5,
        )
        assert _passes(docs, ["measurement", "Measurement", "MEASUREMENT", "evals", "baseline"], top_n=3), (
            f"Expected measurement mandate chunk in top 3, got: {[d[:60] for d in docs[:3]]}"
        )


# ── Hebrew FTS Tests ─────────────────────────────────────────────────────────


@pytest.mark.live
class TestHebrewFTS:
    """Hebrew content should be discoverable via FTS5 and semantic search."""

    def test_hebrew_query_returns_hebrew_content(self, live_store, live_model):
        """Hebrew query should return Hebrew-language chunks."""
        _, docs = _search(live_store, live_model, "לוח זמנים בוקר תזמון", n=5)
        # Check that results contain Hebrew characters
        hebrew_found = any(any("\u0590" <= c <= "\u05ea" for c in doc) for doc in docs[:3])
        assert hebrew_found, (
            f"Expected Hebrew content in top 3 results for Hebrew query. "
            f"Got: {[d[:60] for d in docs[:3]]}"
        )
" + f"Got: {[d[:60] for d in docs[:3]]}" + ) + + def test_hebrew_style_correction_findable(self, live_store, live_model): + """Hebrew writing style correction should be findable by style query.""" + ids, docs = _search( + live_store, + live_model, + "em dashes Hebrew writing style correction freelance", + n=5, + ) + assert _passes(docs, ["hebrew", "Hebrew", "HEBREW", "em dash", "אתן", "style"], top_n=3), ( + f"Expected Hebrew style chunk in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + +# ── Cross-Project Search Tests ─────────────────────────────────────────────── + + +@pytest.mark.live +class TestCrossProject: + """Cross-project searches should surface multi-repo content.""" + + def test_brainlayer_fts5_architecture_findable(self, live_store, live_model): + """BrainLayer FTS5 architecture discussion should be findable.""" + ids, docs = _search( + live_store, + live_model, + "FTS5 search quality gaps summary tags indexed", + n=5, + project="brainlayer", + ) + assert _passes(docs, ["FTS5", "fts5", "summary", "tags", "indexed", "gaps"], top_n=3), ( + f"Expected FTS5 gap analysis in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_golems_architecture_findable(self, live_store, live_model): + """Golems architecture discussions should be findable cross-project.""" + ids, docs = _search( + live_store, + live_model, + "golems monorepo architecture golem-powers CLI", + n=5, + ) + assert _passes(docs, ["golems", "Golems", "golem", "monorepo"], top_n=3), ( + f"Expected golems architecture in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + +# ── Decision/Correction Retrieval Tests ───────────────────────────────────── + + +@pytest.mark.live +class TestDecisionRetrieval: + """Stored decisions and corrections should be retrievable.""" + + def test_voicelayer_architectural_rule_findable(self, live_store, live_model): + """The VoiceLayer architectural rule (local tools) should be findable.""" + ids, docs = _search( + live_store, + live_model, + "voicelayer local 
CLI voice tools architecture rule", + n=5, + ) + assert _passes( + docs, + ["local", "CLI", "architecture", "VoiceLayer", "voicelayer", "voice"], + top_n=3, + ), ( + f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_brainlayer_decision_findable(self, live_store, live_model): + """BrainLayer v3 major architecture decisions should be findable.""" + ids, docs = _search( + live_store, + live_model, + "BrainLayer v3 architecture decisions sqlite-vec embeddings", + n=5, + project="brainlayer", + ) + assert _passes(docs, ["BrainLayer", "brainlayer", "architecture", "v3", "sqlite"], top_n=3), ( + f"Expected BrainLayer architecture decision in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + +# ── Memory / Deep Mode Tests ───────────────────────────────────────────────── + + +@pytest.mark.live +class TestMemoryRetrieval: + """Memory recall queries should surface stored manual chunks.""" + + def test_whoop_discussion_findable(self, live_store, live_model): + """WHOOP recovery discussions should be findable via memory query.""" + ids, docs = _search( + live_store, + live_model, + "remember when we discussed WHOOP recovery score", + n=5, + ) + assert _passes(docs, ["whoop", "Whoop", "WHOOP", "recovery", "Recovery"], top_n=3), ( + f"Expected WHOOP content in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_coach_schedule_findable(self, live_store, live_model): + """Coach scheduling context should be findable.""" + ids, docs = _search( + live_store, + live_model, + "morning schedule wake up huberman protocol", + n=5, + ) + assert _passes(docs, ["schedule", "morning", "huberman", "Huberman", "wake", "coach"], top_n=3), ( + f"Expected schedule/huberman content in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + +# ── Mined From Logs: Real User Queries ─────────────────────────────────────── + + +@pytest.mark.live +class TestMinedQueries: + """Real queries from session transcripts — highest-value test cases.""" + + def 
test_brainlayer_search_quality_evaluation(self, live_store, live_model): + """User's actual query: search quality eval measurement (seen in sessions).""" + ids, docs = _search( + live_store, + live_model, + "brainlayer search quality evaluation evals measurement mandate", + n=5, + ) + assert _passes(docs, ["measurement", "MEASUREMENT", "evals", "baseline", "quality"], top_n=3), ( + f"Expected measurement mandate chunk in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_enrichment_backend_mlx_groq(self, live_store, live_model): + """User query: enrichment backend status (from brainlayer sessions).""" + ids, docs = _search( + live_store, + live_model, + "enrichment MLX Groq backend progress stats chunks", + n=5, + project="brainlayer", + ) + assert _passes(docs, ["enrichment", "MLX", "mlx", "Groq", "groq", "backend", "chunks"], top_n=3), ( + f"Expected enrichment backend info in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_6pm_architecture_decisions(self, live_store, live_model): + """User query: 6pm architecture decisions (mined from 6pm-mini sessions).""" + ids, docs = _search( + live_store, + live_model, + "6pm scheduling architecture confirmed decisions", + n=5, + ) + assert _passes(docs, ["6pm", "6PM", "scheduling", "architecture", "decision"], top_n=3), ( + f"Expected 6PM architecture in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + def test_cursor_cli_vs_ide_difference(self, live_store, live_model): + """User query: cursor CLI agent vs cursor IDE (mined from 6pm sessions).""" + ids, docs = _search( + live_store, + live_model, + "cursor CLI agent versus cursor IDE difference", + n=5, + ) + assert _passes(docs, ["cursor", "Cursor", "CLI", "IDE", "agent"], top_n=3), ( + f"Expected cursor CLI vs IDE content in top 3, got: {[d[:60] for d in docs[:3]]}" + ) + + +# ── Gap Identification Tests ───────────────────────────────────────────────── + + +@pytest.mark.live +class TestKnownGaps: + """Test cases expected to FAIL at baseline — 
document search gaps.""" + + @pytest.mark.xfail(reason="Cross-project auth: FTS returns brainlayer code not auth patterns") + def test_authentication_patterns_cross_project(self, live_store, live_model): + """Searching 'authentication JWT' cross-project should find auth patterns, not brainlayer code.""" + ids, docs = _search( + live_store, + live_model, + "authentication JWT tokens security implementation", + n=5, + ) + # Expect actual auth implementation, not brainlayer internals + assert _passes(docs, ["auth", "JWT", "token", "security"], top_n=3) and not _passes( + docs, ["hybrid_search", "vector_store", "VectorStore"], top_n=2 + ), (f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}") + + @pytest.mark.xfail(reason="Hebrew semantic: returns unrelated Hebrew content (Albania trip)") + def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model): + """Hebrew 'schedule morning' query should return schedule content, not unrelated Hebrew.""" + ids, docs = _search(live_store, live_model, "לוח זמנים בוקר קוד", n=5) + # Should NOT return Albania trip planning + albania_in_results = any("אלבניה" in doc for doc in docs[:3]) + assert not albania_in_results, ( + f"Hebrew query returned Albania trip content instead of schedule. " + f"This is the gap to fix." + ) + + +# ── Baseline Runner ────────────────────────────────────────────────────────── + + +def run_baseline() -> dict: + """Run all eval cases and return scored results. + + Called by scripts/run_evals.py to generate eval_baselines.json. 
+ """ + import os + import sys + + src = Path(__file__).parent.parent / "src" + if str(src) not in sys.path: + sys.path.insert(0, str(src)) + + from brainlayer.embeddings import get_embedding_model + from brainlayer.paths import get_db_path + from brainlayer.vector_store import VectorStore + + db = get_db_path() + store = VectorStore(db) + model = get_embedding_model() + + # All eval cases: (name, query, expected_snippets, top_n, project, tag) + eval_cases = [ + # Entity routing + ("entity_avi_simon", "Avi Simon platform invites schedule", ["avi simon", "6pm", "6PM"], 3, None, None), + ("entity_fedor", "Fedor iOS build handover GitHub", ["fedor", "Fedor", "iOS", "MeHayom"], 3, None, None), + ("entity_yuval_mehayom", "MeHayom Yuval sprint payment", ["yuval", "Yuval", "MeHayom"], 3, None, None), + # Tag filter + ("tag_decision", "important decision", ["decision", "DECISION", "chose", "decided"], 3, None, "decision"), + ("tag_voicelayer_scoped", "architecture decision voicelayer", ["VoiceLayer", "voice"], 3, "voicelayer", None), + # Recency + ("recency_milestone", "measurement mandate evals before improvements", ["measurement", "evals", "baseline"], 3, None, None), + # Hebrew FTS + ("hebrew_style_correction", "em dashes Hebrew writing style correction freelance", ["hebrew", "Hebrew", "em dash", "style"], 3, None, None), + # Cross-project + ("cross_fts5_architecture", "FTS5 search quality gaps summary tags indexed", ["FTS5", "fts5", "summary", "gaps"], 3, "brainlayer", None), + ("cross_golems_monorepo", "golems monorepo architecture golem-powers CLI", ["golems", "golem", "monorepo"], 3, None, None), + # Decision retrieval + ("decision_voicelayer_rule", "voicelayer local CLI voice tools architecture rule", ["local", "CLI", "VoiceLayer", "voice"], 3, None, None), + ("decision_brainlayer_v3", "BrainLayer v3 architecture decisions sqlite-vec", ["BrainLayer", "architecture", "v3", "sqlite"], 3, "brainlayer", None), + # Memory retrieval + ("memory_whoop", "remember when we 
discussed WHOOP recovery score", ["whoop", "Whoop", "WHOOP", "recovery"], 3, None, None), + ("memory_coach_schedule", "morning schedule wake up huberman protocol", ["schedule", "morning", "huberman", "Huberman", "coach"], 3, None, None), + # Mined from logs + ("mined_search_quality", "brainlayer search quality evaluation evals measurement mandate", ["measurement", "evals", "baseline", "quality"], 3, None, None), + ("mined_enrichment_backend", "enrichment MLX Groq backend progress stats chunks", ["enrichment", "MLX", "Groq", "backend"], 3, "brainlayer", None), + ("mined_6pm_decisions", "6pm scheduling architecture confirmed decisions", ["6pm", "6PM", "scheduling", "architecture"], 3, None, None), + ("mined_cursor_cli", "cursor CLI agent versus cursor IDE difference", ["cursor", "Cursor", "CLI", "IDE"], 3, None, None), + # Known gaps (expected to fail at baseline) + ("gap_auth_cross_project", "authentication JWT tokens security implementation", ["auth", "JWT", "token"], 3, None, None), + ("gap_hebrew_semantic", "לוח זמנים בוקר קוד", ["לוח זמנים", "schedule", "morning"], 3, None, None), + ] + + results = { + "run_date": __import__("datetime").date.today().isoformat(), + "db": str(db), + "cases": [], + "pass_count": 0, + "total": len(eval_cases), + "score_pct": 0.0, + } + + for name, query, expected_snippets, top_n, project, tag in eval_cases: + try: + ids, docs = _search(store, model, query, n=top_n + 2, project=project, tag=tag) + passed = _passes(docs, expected_snippets, top_n=top_n) + # Find actual rank of first matching doc + actual_rank = None + for i, doc in enumerate(docs): + if any(s.lower() in doc.lower() for s in expected_snippets): + actual_rank = i + 1 + break + top_snippet = docs[0][:100] if docs else "" + top_chunk_id = ids[0] if ids else "" + except Exception as e: + passed = False + actual_rank = None + top_snippet = f"ERROR: {e}" + top_chunk_id = "" + + case_result = { + "name": name, + "query": query, + "expected_snippets": expected_snippets[:3], + 
"top_n": top_n, + "project": project, + "tag": tag, + "passed": passed, + "actual_rank": actual_rank, + "top_chunk_id": top_chunk_id[:50], + "top_snippet": top_snippet, + } + results["cases"].append(case_result) + if passed: + results["pass_count"] += 1 + + results["score_pct"] = round(results["pass_count"] / results["total"] * 100, 1) + store.close() + return results + + +# ── Prompt Hook Entity Injection Tests ────────────────────────────────────── + + +@pytest.mark.live +class TestPromptHookEntityInjection: + """The UserPromptSubmit hook should detect entity names and inject entity context. + + Baseline: FAIL (hook only does FTS5 keyword search — no entity detection). + After Phase A: PASS (hook detects entities → injects KG profile + linked chunks). + """ + + HOOK_PATH = Path.home() / ".claude" / "hooks" / "brainlayer-prompt-search.py" + + def _call_hook(self, prompt: str) -> str: + """Run the hook subprocess and return its stdout.""" + import subprocess + + if not self.HOOK_PATH.exists(): + pytest.skip(f"Hook not found at {self.HOOK_PATH}") + + result = subprocess.run( + ["python3", str(self.HOOK_PATH)], + input=json.dumps({"prompt": prompt}), + capture_output=True, + text=True, + timeout=5, + ) + return result.stdout + + def test_entity_detected_avi_simon(self): + """Hook should detect 'Avi Simon' as a known entity and inject entity label.""" + output = self._call_hook("What are Avi Simon's meeting preferences?") + # After Phase A: output should mention entity type or entity header + assert "[entity:" in output.lower() or "entity: avi simon" in output.lower() or ( + "avi simon" in output.lower() and "person" in output.lower() + ), ( + f"Expected entity injection for 'Avi Simon' in hook output. 
" + f"This is the Phase A gap (baseline = FAIL).\n" + f"Actual output: {output[:300]!r}" + ) + + def test_entity_detected_fedor(self): + """Hook should detect 'Fedor' as a known entity and inject entity label.""" + output = self._call_hook("What is Fedor working on with GitHub access?") + assert "[entity:" in output.lower() or "entity: fedor" in output.lower() or ( + "fedor" in output.lower() and "person" in output.lower() + ), ( + f"Expected entity injection for 'Fedor' in hook output. " + f"Phase A gap (baseline = FAIL).\n" + f"Actual output: {output[:300]!r}" + ) + + def test_hook_injects_entity_before_fts_results(self): + """When entity detected, entity section should appear first in hook output.""" + output = self._call_hook("Tell me about Avi Simon and his 6PM project") + lines = [l for l in output.strip().split("\n") if l.strip()] + if not lines: + pytest.skip("Hook returned no output") + # Entity section should come before FTS results + first_line = lines[0].lower() + assert "[entity" in first_line or "entity:" in first_line, ( + f"First line should be entity header, got: {first_line!r}" + ) + + def test_no_entity_in_generic_query(self): + """Hook should NOT inject entity section for generic queries.""" + output = self._call_hook("How does authentication work in Python?") + # Generic query — no entity injection expected + assert "[entity:" not in output.lower(), ( + f"Generic query should not trigger entity injection. Got: {output[:200]!r}" + ) + + +def run_hook_baseline() -> dict: + """Run hook entity injection eval cases. Returns scored results. + + Called by scripts/run_evals.py as part of full baseline. 
+ """ + import subprocess + + hook_path = Path.home() / ".claude" / "hooks" / "brainlayer-prompt-search.py" + + def call_hook(prompt: str) -> str: + if not hook_path.exists(): + return "" + r = subprocess.run( + ["python3", str(hook_path)], + input=json.dumps({"prompt": prompt}), + capture_output=True, text=True, timeout=5, + ) + return r.stdout + + hook_cases = [ + ("hook_entity_avi_simon", "What are Avi Simon's meeting preferences?", + ["[entity:", "entity: avi simon", "person"]), + ("hook_entity_fedor", "What is Fedor working on with GitHub access?", + ["[entity:", "entity: fedor", "person"]), + ("hook_entity_first_line", "Tell me about Avi Simon and his 6PM project", + ["[entity"]), # entity header must be first line + ("hook_no_entity_generic", "How does authentication work in Python?", + None), # None = expect entity NOT present + ] + + cases = [] + pass_count = 0 + for name, prompt, expected in hook_cases: + output = call_hook(prompt) + output_lower = output.lower() + + if expected is None: + # Negative test: entity injection should NOT appear + passed = "[entity:" not in output_lower + else: + passed = any(e.lower() in output_lower for e in expected) + + cases.append({ + "name": name, + "query": prompt, + "expected": expected, + "passed": passed, + "output_preview": output[:150], + }) + if passed: + pass_count += 1 + + return { + "section": "hook_entity_injection", + "pass_count": pass_count, + "total": len(hook_cases), + "score_pct": round(pass_count / len(hook_cases) * 100, 1), + "cases": cases, + } + + +if __name__ == "__main__": + import json + + baseline = run_baseline() + hook_baseline = run_hook_baseline() + + print(f"\n=== brain_search quality ===") + print(f"Baseline Score: {baseline['pass_count']}/{baseline['total']} ({baseline['score_pct']}%)") + print(f"Run date: {baseline['run_date']}") + for case in baseline["cases"]: + status = "✓ PASS" if case["passed"] else "✗ FAIL" + rank = f"rank={case['actual_rank']}" if case["actual_rank"] else "not 
found" + print(f" {status} [{case['name']}] {rank}") + if not case["passed"]: + print(f" top: {case['top_snippet'][:70]!r}") + + print(f"\n=== hook entity injection ===") + print(f"Baseline Score: {hook_baseline['pass_count']}/{hook_baseline['total']} ({hook_baseline['score_pct']}%)") + for case in hook_baseline["cases"]: + status = "✓ PASS" if case["passed"] else "✗ FAIL" + print(f" {status} [{case['name']}]") + if not case["passed"]: + print(f" output: {case['output_preview'][:80]!r}") + + combined = { + "search": baseline, + "hook": hook_baseline, + "combined_score_pct": round( + (baseline["pass_count"] + hook_baseline["pass_count"]) + / (baseline["total"] + hook_baseline["total"]) * 100, 1 + ), + } + print(f"\nCombined: {combined['combined_score_pct']}%") + output_path = Path(__file__).parent / "eval_baselines.json" + output_path.write_text(json.dumps(combined, indent=2)) + print(f"Saved to {output_path}") From 0d5cc75ecfadcb36167d3de62ace11c48991a647 Mon Sep 17 00:00:00 2001 From: Etan Joseph Heyman Date: Mon, 9 Mar 2026 13:24:19 +0200 Subject: [PATCH 2/3] fix: resolve ruff lint errors (unused import, f-string placeholders) Co-Authored-By: Claude Opus 4.6 --- tests/test_eval_baselines.py | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py index 31f16b17..d9e0b933 100644 --- a/tests/test_eval_baselines.py +++ b/tests/test_eval_baselines.py @@ -423,8 +423,8 @@ def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model): # Should NOT return Albania trip planning albania_in_results = any("אלבניה" in doc for doc in docs[:3]) assert not albania_in_results, ( - f"Hebrew query returned Albania trip content instead of schedule. " - f"This is the gap to fix." + "Hebrew query returned Albania trip content instead of schedule. " + "This is the gap to fix." ) @@ -436,7 +436,6 @@ def run_baseline() -> dict: Called by scripts/run_evals.py to generate eval_baselines.json. 
""" - import os import sys src = Path(__file__).parent.parent / "src" @@ -671,7 +670,7 @@ def call_hook(prompt: str) -> str: baseline = run_baseline() hook_baseline = run_hook_baseline() - print(f"\n=== brain_search quality ===") + print("\n=== brain_search quality ===") print(f"Baseline Score: {baseline['pass_count']}/{baseline['total']} ({baseline['score_pct']}%)") print(f"Run date: {baseline['run_date']}") for case in baseline["cases"]: @@ -681,7 +680,7 @@ def call_hook(prompt: str) -> str: if not case["passed"]: print(f" top: {case['top_snippet'][:70]!r}") - print(f"\n=== hook entity injection ===") + print("\n=== hook entity injection ===") print(f"Baseline Score: {hook_baseline['pass_count']}/{hook_baseline['total']} ({hook_baseline['score_pct']}%)") for case in hook_baseline["cases"]: status = "✓ PASS" if case["passed"] else "✗ FAIL" From 8823418d0cade64fa141399df202a5f873f88e85 Mon Sep 17 00:00:00 2001 From: Etan Joseph Heyman Date: Mon, 9 Mar 2026 13:25:57 +0200 Subject: [PATCH 3/3] style: format test_eval_baselines.py with ruff Co-Authored-By: Claude Opus 4.6 --- tests/test_eval_baselines.py | 188 ++++++++++++++++++++++++++--------- 1 file changed, 143 insertions(+), 45 deletions(-) diff --git a/tests/test_eval_baselines.py b/tests/test_eval_baselines.py index d9e0b933..91f29052 100644 --- a/tests/test_eval_baselines.py +++ b/tests/test_eval_baselines.py @@ -189,8 +189,7 @@ def test_today_returns_recent_chunks(self, live_store, live_model): recent_found = True break assert recent_found, ( - f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. " - f"Got chunk_ids: {ids[:5]}" + f"Expected at least one chunk from 2026-03-09/10 in top 5 for 'today' query. 
Got chunk_ids: {ids[:5]}" ) def test_recent_milestone_findable(self, live_store, live_model): @@ -219,8 +218,7 @@ def test_hebrew_query_returns_hebrew_content(self, live_store, live_model): # Check that results contain Hebrew characters hebrew_found = any(any("\u0590" <= c <= "\u05ea" for c in doc) for doc in docs[:3]) assert hebrew_found, ( - f"Expected Hebrew content in top 3 results for Hebrew query. " - f"Got: {[d[:60] for d in docs[:3]]}" + f"Expected Hebrew content in top 3 results for Hebrew query. Got: {[d[:60] for d in docs[:3]]}" ) def test_hebrew_style_correction_findable(self, live_store, live_model): @@ -288,9 +286,7 @@ def test_voicelayer_architectural_rule_findable(self, live_store, live_model): docs, ["local", "CLI", "architecture", "VoiceLayer", "voicelayer", "voice"], top_n=3, - ), ( - f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}" - ) + ), f"Expected VoiceLayer architecture rule in top 3, got: {[d[:60] for d in docs[:3]]}" def test_brainlayer_decision_findable(self, live_store, live_model): """BrainLayer v3 major architecture decisions should be findable.""" @@ -414,7 +410,7 @@ def test_authentication_patterns_cross_project(self, live_store, live_model): # Expect actual auth implementation, not brainlayer internals assert _passes(docs, ["auth", "JWT", "token", "security"], top_n=3) and not _passes( docs, ["hybrid_search", "vector_store", "VectorStore"], top_n=2 - ), (f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}") + ), f"Expected auth patterns, got brainlayer internals: {[d[:60] for d in docs[:3]]}" @pytest.mark.xfail(reason="Hebrew semantic: returns unrelated Hebrew content (Albania trip)") def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model): @@ -423,8 +419,7 @@ def test_hebrew_schedule_semantic_accuracy(self, live_store, live_model): # Should NOT return Albania trip planning albania_in_results = any("אלבניה" in doc for doc in docs[:3]) assert not 
albania_in_results, ( - "Hebrew query returned Albania trip content instead of schedule. " - "This is the gap to fix." + "Hebrew query returned Albania trip content instead of schedule. This is the gap to fix." ) @@ -460,25 +455,116 @@ def run_baseline() -> dict: ("tag_decision", "important decision", ["decision", "DECISION", "chose", "decided"], 3, None, "decision"), ("tag_voicelayer_scoped", "architecture decision voicelayer", ["VoiceLayer", "voice"], 3, "voicelayer", None), # Recency - ("recency_milestone", "measurement mandate evals before improvements", ["measurement", "evals", "baseline"], 3, None, None), + ( + "recency_milestone", + "measurement mandate evals before improvements", + ["measurement", "evals", "baseline"], + 3, + None, + None, + ), # Hebrew FTS - ("hebrew_style_correction", "em dashes Hebrew writing style correction freelance", ["hebrew", "Hebrew", "em dash", "style"], 3, None, None), + ( + "hebrew_style_correction", + "em dashes Hebrew writing style correction freelance", + ["hebrew", "Hebrew", "em dash", "style"], + 3, + None, + None, + ), # Cross-project - ("cross_fts5_architecture", "FTS5 search quality gaps summary tags indexed", ["FTS5", "fts5", "summary", "gaps"], 3, "brainlayer", None), - ("cross_golems_monorepo", "golems monorepo architecture golem-powers CLI", ["golems", "golem", "monorepo"], 3, None, None), + ( + "cross_fts5_architecture", + "FTS5 search quality gaps summary tags indexed", + ["FTS5", "fts5", "summary", "gaps"], + 3, + "brainlayer", + None, + ), + ( + "cross_golems_monorepo", + "golems monorepo architecture golem-powers CLI", + ["golems", "golem", "monorepo"], + 3, + None, + None, + ), # Decision retrieval - ("decision_voicelayer_rule", "voicelayer local CLI voice tools architecture rule", ["local", "CLI", "VoiceLayer", "voice"], 3, None, None), - ("decision_brainlayer_v3", "BrainLayer v3 architecture decisions sqlite-vec", ["BrainLayer", "architecture", "v3", "sqlite"], 3, "brainlayer", None), + ( + 
"decision_voicelayer_rule", + "voicelayer local CLI voice tools architecture rule", + ["local", "CLI", "VoiceLayer", "voice"], + 3, + None, + None, + ), + ( + "decision_brainlayer_v3", + "BrainLayer v3 architecture decisions sqlite-vec", + ["BrainLayer", "architecture", "v3", "sqlite"], + 3, + "brainlayer", + None, + ), # Memory retrieval - ("memory_whoop", "remember when we discussed WHOOP recovery score", ["whoop", "Whoop", "WHOOP", "recovery"], 3, None, None), - ("memory_coach_schedule", "morning schedule wake up huberman protocol", ["schedule", "morning", "huberman", "Huberman", "coach"], 3, None, None), + ( + "memory_whoop", + "remember when we discussed WHOOP recovery score", + ["whoop", "Whoop", "WHOOP", "recovery"], + 3, + None, + None, + ), + ( + "memory_coach_schedule", + "morning schedule wake up huberman protocol", + ["schedule", "morning", "huberman", "Huberman", "coach"], + 3, + None, + None, + ), # Mined from logs - ("mined_search_quality", "brainlayer search quality evaluation evals measurement mandate", ["measurement", "evals", "baseline", "quality"], 3, None, None), - ("mined_enrichment_backend", "enrichment MLX Groq backend progress stats chunks", ["enrichment", "MLX", "Groq", "backend"], 3, "brainlayer", None), - ("mined_6pm_decisions", "6pm scheduling architecture confirmed decisions", ["6pm", "6PM", "scheduling", "architecture"], 3, None, None), - ("mined_cursor_cli", "cursor CLI agent versus cursor IDE difference", ["cursor", "Cursor", "CLI", "IDE"], 3, None, None), + ( + "mined_search_quality", + "brainlayer search quality evaluation evals measurement mandate", + ["measurement", "evals", "baseline", "quality"], + 3, + None, + None, + ), + ( + "mined_enrichment_backend", + "enrichment MLX Groq backend progress stats chunks", + ["enrichment", "MLX", "Groq", "backend"], + 3, + "brainlayer", + None, + ), + ( + "mined_6pm_decisions", + "6pm scheduling architecture confirmed decisions", + ["6pm", "6PM", "scheduling", "architecture"], + 3, + None, 
+ None, + ), + ( + "mined_cursor_cli", + "cursor CLI agent versus cursor IDE difference", + ["cursor", "Cursor", "CLI", "IDE"], + 3, + None, + None, + ), # Known gaps (expected to fail at baseline) - ("gap_auth_cross_project", "authentication JWT tokens security implementation", ["auth", "JWT", "token"], 3, None, None), + ( + "gap_auth_cross_project", + "authentication JWT tokens security implementation", + ["auth", "JWT", "token"], + 3, + None, + None, + ), ("gap_hebrew_semantic", "לוח זמנים בוקר קוד", ["לוח זמנים", "schedule", "morning"], 3, None, None), ] @@ -563,8 +649,10 @@ def test_entity_detected_avi_simon(self): """Hook should detect 'Avi Simon' as a known entity and inject entity label.""" output = self._call_hook("What are Avi Simon's meeting preferences?") # After Phase A: output should mention entity type or entity header - assert "[entity:" in output.lower() or "entity: avi simon" in output.lower() or ( - "avi simon" in output.lower() and "person" in output.lower() + assert ( + "[entity:" in output.lower() + or "entity: avi simon" in output.lower() + or ("avi simon" in output.lower() and "person" in output.lower()) ), ( f"Expected entity injection for 'Avi Simon' in hook output. " f"This is the Phase A gap (baseline = FAIL).\n" @@ -574,8 +662,10 @@ def test_entity_detected_avi_simon(self): def test_entity_detected_fedor(self): """Hook should detect 'Fedor' as a known entity and inject entity label.""" output = self._call_hook("What is Fedor working on with GitHub access?") - assert "[entity:" in output.lower() or "entity: fedor" in output.lower() or ( - "fedor" in output.lower() and "person" in output.lower() + assert ( + "[entity:" in output.lower() + or "entity: fedor" in output.lower() + or ("fedor" in output.lower() and "person" in output.lower()) ), ( f"Expected entity injection for 'Fedor' in hook output. 
" f"Phase A gap (baseline = FAIL).\n" @@ -618,19 +708,25 @@ def call_hook(prompt: str) -> str: r = subprocess.run( ["python3", str(hook_path)], input=json.dumps({"prompt": prompt}), - capture_output=True, text=True, timeout=5, + capture_output=True, + text=True, + timeout=5, ) return r.stdout hook_cases = [ - ("hook_entity_avi_simon", "What are Avi Simon's meeting preferences?", - ["[entity:", "entity: avi simon", "person"]), - ("hook_entity_fedor", "What is Fedor working on with GitHub access?", - ["[entity:", "entity: fedor", "person"]), - ("hook_entity_first_line", "Tell me about Avi Simon and his 6PM project", - ["[entity"]), # entity header must be first line - ("hook_no_entity_generic", "How does authentication work in Python?", - None), # None = expect entity NOT present + ( + "hook_entity_avi_simon", + "What are Avi Simon's meeting preferences?", + ["[entity:", "entity: avi simon", "person"], + ), + ("hook_entity_fedor", "What is Fedor working on with GitHub access?", ["[entity:", "entity: fedor", "person"]), + ( + "hook_entity_first_line", + "Tell me about Avi Simon and his 6PM project", + ["[entity"], + ), # entity header must be first line + ("hook_no_entity_generic", "How does authentication work in Python?", None), # None = expect entity NOT present ] cases = [] @@ -645,13 +741,15 @@ def call_hook(prompt: str) -> str: else: passed = any(e.lower() in output_lower for e in expected) - cases.append({ - "name": name, - "query": prompt, - "expected": expected, - "passed": passed, - "output_preview": output[:150], - }) + cases.append( + { + "name": name, + "query": prompt, + "expected": expected, + "passed": passed, + "output_preview": output[:150], + } + ) if passed: pass_count += 1 @@ -692,8 +790,8 @@ def call_hook(prompt: str) -> str: "search": baseline, "hook": hook_baseline, "combined_score_pct": round( - (baseline["pass_count"] + hook_baseline["pass_count"]) - / (baseline["total"] + hook_baseline["total"]) * 100, 1 + (baseline["pass_count"] + 
hook_baseline["pass_count"]) / (baseline["total"] + hook_baseline["total"]) * 100, + 1, ), } print(f"\nCombined: {combined['combined_score_pct']}%")