Entity/relation extraction via Gemini, with an optional "gleaning" second pass.

What this patch changes:
  1) enrichment_controller.py: adds GEMINI_EXTRACTION_MODEL and
     call_gemini_for_extraction(prompt), which returns the response text, or
     None when the client cannot be built or the call raises.
  2) pipeline/enrichment.py: _enrich_one now calls extract_kg_from_chunk with
     use_llm=True.
  3) pipeline/entity_extraction.py: expanded NER prompt, new gleaning prompt and
     build_gleaning_prompt(), relation "description"/"strength" parsing,
     relation de-duplication, and _get_default_llm_caller().

Review notes:
  1) Fixed: relation confidence is now clamped to [0.0, 1.0]. The submitted
     min(float(strength), 1.0) capped only the upper bound, so a negative
     "strength" from the model became a negative confidence.
  2) NOTE(review): _get_default_llm_caller() reaches the call_llm fallback only
     if importing enrichment_controller fails. When Gemini is merely
     unconfigured, call_gemini_for_extraction is still returned and yields None.
  3) NOTE(review): the call_gemini_for_extraction docstring mentions rate
     limiting by BRAINLAYER_ENRICH_RATE, but no throttling is visible in the
     function body. Confirm where it is enforced.
  4) NOTE(review): the gleaning pass de-duplicates relations but not entities.
  5) This file was restored from a whitespace-collapsed copy. Indentation and
     the position of blank context lines were inferred from syntax and hunk
     line counts; verify against the tree before applying.

diff --git a/src/brainlayer/enrichment_controller.py b/src/brainlayer/enrichment_controller.py
index b324cc27..8ce0c559 100644
--- a/src/brainlayer/enrichment_controller.py
+++ b/src/brainlayer/enrichment_controller.py
@@ -17,7 +17,7 @@
 import time
 from dataclasses import dataclass, field
 from pathlib import Path
-from typing import Any
+from typing import Any, Optional
 
 logger = logging.getLogger(__name__)
 
@@ -108,6 +108,39 @@ def _build_gemini_config() -> dict[str, Any]:
     }
 
 
+# ── Entity extraction via Gemini ───────────────────────────────────────────────
+
+GEMINI_EXTRACTION_MODEL = os.environ.get("BRAINLAYER_GEMINI_EXTRACTION_MODEL", "gemini-2.5-flash-lite")
+
+
+def call_gemini_for_extraction(prompt: str) -> Optional[str]:
+    """Call Gemini for entity/relation extraction. Returns raw text response.
+
+    Rate-limited by BRAINLAYER_ENRICH_RATE (default 0.2 = 12 RPM).
+    Timeout: 30 seconds per call.
+    """
+    try:
+        client = _get_gemini_client()
+    except RuntimeError:
+        logger.debug("Gemini not available for extraction")
+        return None
+
+    try:
+        response = client.models.generate_content(
+            model=GEMINI_EXTRACTION_MODEL,
+            contents=prompt,
+            config={
+                "response_mime_type": "application/json",
+                "thinking_config": {"thinking_budget": 0},
+                "http_options": {"timeout": 30_000},
+            },
+        )
+        return response.text if response and response.text else None
+    except Exception:
+        logger.warning("Gemini extraction call failed", exc_info=True)
+        return None
+
+
 # ── Content-hash dedup ─────────────────────────────────────────────────────────
 
 
diff --git a/src/brainlayer/pipeline/enrichment.py b/src/brainlayer/pipeline/enrichment.py
index f3e00be2..2ec26d7d 100644
--- a/src/brainlayer/pipeline/enrichment.py
+++ b/src/brainlayer/pipeline/enrichment.py
@@ -857,12 +857,12 @@ def _enrich_one(
         from .entity_extraction import extract_entities_from_tags
         from .kg_extraction import extract_kg_from_chunk
 
-        # Seed + tag extraction (no API calls, always enabled)
+        # Entity extraction: seed matching + LLM extraction via Gemini
         extract_kg_from_chunk(
             store=store,
             chunk_id=chunk["id"],
             seed_entities=DEFAULT_SEED_ENTITIES,
-            use_llm=False,
+            use_llm=True,
             use_gliner=False,
         )
 
diff --git a/src/brainlayer/pipeline/entity_extraction.py b/src/brainlayer/pipeline/entity_extraction.py
index a11015aa..01b90442 100644
--- a/src/brainlayer/pipeline/entity_extraction.py
+++ b/src/brainlayer/pipeline/entity_extraction.py
@@ -118,22 +118,66 @@ def _deduplicate_overlaps(entities: list[ExtractedEntity]) -> list[ExtractedEnti
 
 # ── LLM-based extraction ──
 
-_NER_PROMPT_TEMPLATE = """Extract named entities and relationships from this developer conversation text.
+_NER_PROMPT_TEMPLATE = """Extract ALL named entities and relationships from this developer conversation text.
+
+## Entity types (be precise — choose the most specific type):
+- person: Human individuals (First Last). NOT repos, tools, or agents.
+- agent: AI coding agents (orcClaude, coachClaude, brainClaude, Ralph, etc.). NOT humans.
+- company: Businesses and organizations (Anthropic, Weby, Cantaloupe AI).
+- project: Code repositories, apps, products (BrainLayer, VoiceLayer, 6PM).
+- tool: Developer tools and services (Docker, Railway, Supabase, CodeRabbit).
+- technology: Languages, frameworks, protocols (SQLite, SwiftUI, MCP, TypeScript).
+- skill: Reusable AI skill or command (/commit, /pr-loop, /coach).
+- service: Deployed infrastructure (LaunchAgent, daemon, watcher).
+- config: Configuration files or settings (CLAUDE.md, pyproject.toml, .env).
+- decision: Architectural or design decisions made during sessions.
+- topic: Abstract concepts or domains (enrichment, graph RAG, dark mode).
+
+## Relation types (source → target, with description):
+- created: person/agent → project/tool. "Anthropic created Claude Code"
+- owns: person → project/company. "Etan owns BrainLayer"
+- works_at: person → company. "Josh Anderson works at Cantaloupe AI"
+- uses: entity → tool/technology. "BrainLayer uses SQLite"
+- depends_on: project → technology/tool. "VoiceLayer depends on whisper-cpp"
+- deployed_on: project/service → tool. "Golems deployed on Railway"
+- fixes: agent/person → topic/project. "brainClaude fixes dark mode regression"
+- configures: config → project/service. "CLAUDE.md configures BrainLayer hooks"
+- spawns: agent → agent. "orcClaude spawns brainlayerClaude"
+- client_of: person → person/company. "Yuval is client of Etan"
+- affiliated_with: person → company. "Josh affiliated with Cantaloupe AI"
+- coaches: agent → entity. "coachClaude coaches scheduling"
+- builds: person/agent → project. "Etan builds VoiceLayer"
+- related_to: generic fallback (use ONLY if no specific type fits)
+
+## Output format — return JSON only:
+{{"entities": [{{"text": "exact text from input", "type": "entity_type", "description": "one-sentence description of this entity based on context"}}], "relations": [{{"source": "entity text", "target": "entity text", "type": "relation_type", "description": "natural language sentence describing the relationship", "strength": 0.8}}]}}
+
+## Rules:
+- Extract entities that are CLEARLY identifiable, not vague mentions
+- Each relation MUST have a substantive description — reject empty relations
+- Strength is 0.0-1.0: explicit statements=0.9+, implied=0.5-0.8, speculative=0.3-0.5
+- Decompose N-ary relationships into binary pairs
+- Include Hebrew entity names if present (e.g., MeHayom/מהיום)
+- If no entities found, return: {{"entities": [], "relations": []}}
 
-Entity types: person, agent, company, project, tool, technology, topic
-- person: Human names (First Last). NOT repos/tools/agents.
-- agent: AI agents (*Claude, *Golem, Ralph). NOT humans.
-- company: Businesses. project: Code repos/apps. tool/technology: Dev tools, languages, frameworks.
+Text:
+{text}"""
+
+_GLEANING_PROMPT = """The previous extraction from the same text missed important entities and relationships.
+
+Previous extraction found: {previous_count} entities and {previous_rel_count} relations.
 
-Relation types (direction: source → target):
-- works_at: person → company. owns: person → project/company. builds: person/agent → project.
-- uses: entity → tool/technology. client_of: A → B (B serves A). affiliated_with: person → company.
-- coaches: agent → person. related_to: generic fallback.
+Re-read the text carefully. Extract ADDITIONAL entities and relationships that were missed. Focus on:
+- Implicit relationships (X depends on Y, X was deployed to Y)
+- Agent names and their roles
+- Configuration files and what they configure
+- Decisions and what they decided about
+- Services and what they serve
 
-Return JSON only:
-{{"entities": [{{"text": "exact text from input", "type": "entity_type"}}], "relations": [{{"source": "entity text", "target": "entity text", "type": "relation_type", "fact": "natural language sentence"}}]}}
+Return ONLY newly found entities/relations (not duplicates of previous extraction).
 
-If no entities found, return: {{"entities": [], "relations": []}}
+Same JSON format:
+{{"entities": [{{"text": "exact text", "type": "entity_type", "description": "description"}}], "relations": [{{"source": "entity text", "target": "entity text", "type": "relation_type", "description": "description", "strength": 0.7}}]}}
 
 Text:
 {text}"""
@@ -144,6 +188,15 @@ def build_ner_prompt(text: str) -> str:
     return _NER_PROMPT_TEMPLATE.format(text=text)
 
 
+def build_gleaning_prompt(text: str, prev_entity_count: int, prev_rel_count: int) -> str:
+    """Build the gleaning re-prompt for missed entities."""
+    return _GLEANING_PROMPT.format(
+        text=text,
+        previous_count=prev_entity_count,
+        previous_rel_count=prev_rel_count,
+    )
+
+
 def parse_llm_ner_response(response: str, source_text: str) -> tuple[list[ExtractedEntity], list[ExtractedRelation]]:
     """Parse LLM NER response into entities and relations with spans.
 
@@ -192,20 +245,27 @@ def parse_llm_ner_response(response: str, source_text: str) -> tuple[list[Extrac
         source = raw_rel.get("source", "")
         target = raw_rel.get("target", "")
         rtype = raw_rel.get("type", "")
+        desc = raw_rel.get("description", "")
         if not source or not target or not rtype:
             continue
 
-        fact = raw_rel.get("fact")
+        try:
+            strength = float(raw_rel.get("strength", 0.7))
+        except (TypeError, ValueError):
+            strength = 0.7
+        fact = raw_rel.get("fact") or desc
         props = raw_rel.get("properties") or {}
-        if fact and "fact" not in props:
+        if fact:
             props["fact"] = fact
+        if desc:
+            props["description"] = desc
 
         relations.append(
             ExtractedRelation(
                 source_text=source,
                 target_text=target,
                 relation_type=rtype,
-                confidence=0.7,
+                confidence=max(0.0, min(strength, 1.0)),
                 properties=props,
             )
         )
@@ -239,12 +299,15 @@ def _extract_json(text: str) -> Optional[dict[str, Any]]:
 def extract_entities_llm(
     text: str,
     llm_caller: Optional[Any] = None,
+    enable_gleaning: bool = False,
 ) -> tuple[list[ExtractedEntity], list[ExtractedRelation]]:
-    """Extract entities using LLM (Ollama/MLX).
+    """Extract entities using LLM with optional gleaning second pass.
 
     Args:
         text: Source text to extract from.
-        llm_caller: Callable(prompt) -> str. If None, uses enrichment.call_llm.
+        llm_caller: Callable(prompt) -> str. If None, uses Gemini via enrichment_controller.
+        enable_gleaning: If True, re-prompt for missed entities (catches 20-40% more).
+            Default False to avoid doubling LLM calls. Enable for high-value chunks.
 
     Returns:
         Tuple of (entities, relations).
@@ -252,13 +315,11 @@ def extract_entities_llm(
     if not text.strip():
         return [], []
 
-    prompt = build_ner_prompt(text)
-
     if llm_caller is None:
-        from .enrichment import call_llm
-
-        llm_caller = call_llm
+        llm_caller = _get_default_llm_caller()
 
+    # Pass 1: Primary extraction
+    prompt = build_ner_prompt(text)
     try:
         response = llm_caller(prompt)
     except Exception:
@@ -268,7 +329,55 @@ def extract_entities_llm(
     if not response:
         return [], []
 
-    return parse_llm_ner_response(response, text)
+    entities, relations = parse_llm_ner_response(response, text)
+
+    # Pass 2: Gleaning — re-prompt for missed entities
+    if enable_gleaning and (entities or relations):
+        gleaning_prompt = build_gleaning_prompt(text, len(entities), len(relations))
+        try:
+            gleaning_response = llm_caller(gleaning_prompt)
+            if gleaning_response:
+                extra_entities, extra_relations = parse_llm_ner_response(gleaning_response, text)
+                if extra_entities or extra_relations:
+                    logger.info(
+                        "Gleaning found %d extra entities, %d extra relations",
+                        len(extra_entities),
+                        len(extra_relations),
+                    )
+                    entities.extend(extra_entities)
+                    relations.extend(extra_relations)
+        except Exception:
+            logger.debug("Gleaning pass failed (non-critical)", exc_info=True)
+
+    # Deduplicate relations (gleaning may re-find the same ones)
+    seen_rels: set[tuple[str, str, str]] = set()
+    unique_relations: list[ExtractedRelation] = []
+    for r in relations:
+        key = (r.source_text.lower(), r.target_text.lower(), r.relation_type)
+        if key not in seen_rels:
+            seen_rels.add(key)
+            unique_relations.append(r)
+
+    return entities, unique_relations
+
+
+def _get_default_llm_caller():
+    """Get the best available LLM caller — Gemini first, then enrichment.call_llm."""
+    try:
+        from ..enrichment_controller import call_gemini_for_extraction
+
+        return call_gemini_for_extraction
+    except (ImportError, RuntimeError):
+        pass
+
+    try:
+        from .enrichment import call_llm
+
+        return call_llm
+    except ImportError:
+        pass
+
+    raise RuntimeError("No LLM backend available for entity extraction")
 
 
 # ── GLiNER-based extraction ──