diff --git a/.env b/.env deleted file mode 100644 index cdcc756..0000000 --- a/.env +++ /dev/null @@ -1 +0,0 @@ -OPENROUTER_API_KEY = APIKEY \ No newline at end of file diff --git a/.gitignore b/.gitignore index e69de29..0c2ad09 100644 --- a/.gitignore +++ b/.gitignore @@ -0,0 +1 @@ +.env diff --git a/SQL/instructions_for_creation.md b/SQL/instructions_for_creation.md new file mode 100644 index 0000000..1110b26 --- /dev/null +++ b/SQL/instructions_for_creation.md @@ -0,0 +1,2 @@ +# Para criar o banco: +mysql -u root -p -e "CREATE DATABASE IF NOT EXISTS pybot CHARACTER SET utf8mb4;" \ No newline at end of file diff --git a/SQL/schema.sql b/SQL/schema.sql new file mode 100644 index 0000000..1c73b4b --- /dev/null +++ b/SQL/schema.sql @@ -0,0 +1,10 @@ +CREATE TABLE IF NOT EXISTS knowledge ( + id INT UNSIGNED AUTO_INCREMENT PRIMARY KEY, + title VARCHAR(255) NOT NULL, + content TEXT NOT NULL, + category VARCHAR(100) NOT NULL DEFAULT 'geral', + active TINYINT(1) NOT NULL DEFAULT 1, + created_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP, + updated_at DATETIME NOT NULL DEFAULT CURRENT_TIMESTAMP ON UPDATE CURRENT_TIMESTAMP, + INDEX idx_active_category (active, category) +) ENGINE=InnoDB DEFAULT CHARSET=utf8mb4 COLLATE=utf8mb4_unicode_ci; \ No newline at end of file diff --git a/__pycache__/app.cpython-314.pyc b/__pycache__/app.cpython-314.pyc new file mode 100644 index 0000000..8d62d35 Binary files /dev/null and b/__pycache__/app.cpython-314.pyc differ diff --git a/__pycache__/main.cpython-314.pyc b/__pycache__/main.cpython-314.pyc new file mode 100644 index 0000000..cbfc496 Binary files /dev/null and b/__pycache__/main.cpython-314.pyc differ diff --git a/api/__pycache__/__init__.cpython-314.pyc b/api/__pycache__/__init__.cpython-314.pyc index c327023..77c4f5b 100644 Binary files a/api/__pycache__/__init__.cpython-314.pyc and b/api/__pycache__/__init__.cpython-314.pyc differ diff --git a/api/__pycache__/routes.cpython-314.pyc b/api/__pycache__/routes.cpython-314.pyc index 
af7eb8d..a002356 100644 Binary files a/api/__pycache__/routes.cpython-314.pyc and b/api/__pycache__/routes.cpython-314.pyc differ diff --git a/api/routes.py b/api/routes.py index 0ae7448..cb24c0f 100644 --- a/api/routes.py +++ b/api/routes.py @@ -8,6 +8,8 @@ from fastapi import APIRouter, HTTPException, Request from fastapi.responses import HTMLResponse, StreamingResponse +from collections.abc import AsyncGenerator + log = logging.getLogger("kernelbots.api.chat") router = APIRouter() @@ -75,6 +77,28 @@ async def chat(request: Request) -> StreamingResponse: detail="Campo 'session_id' deve ser string ou omitido.", ) + if user_message.strip().lower() == "/reload": + log.info("🔄 Comando /reload recebido — reconstruindo índice BM25...") + services.search_engine.rebuild() + chunk_count = len(services.search_engine.chunks) + db_count = sum(1 for c in services.search_engine.chunks if c.get("source", "").startswith("db:")) + md_count = chunk_count - db_count + status = ( + f"Índice reconstruído: {chunk_count} chunk(s) total " + f"({md_count} de arquivos .md + {db_count} do MySQL)." 
+ ) + log.info("✅ /reload concluído — %s", status) + + async def _reload_stream() -> AsyncGenerator[str, None]: + yield f"data: {status}\n\n" + yield "data: [DONE]\n\n" + + return StreamingResponse( + _reload_stream(), + media_type="text/event-stream", + headers={"Cache-Control": "no-cache", "X-Accel-Buffering": "no", "Connection": "keep-alive"}, + ) + built = services.context_manager.build_messages( user_message, discipline_filter=discipline, diff --git a/app/__pycache__/__init__.cpython-314.pyc b/app/__pycache__/__init__.cpython-314.pyc index b4a0e4f..db1147b 100644 Binary files a/app/__pycache__/__init__.cpython-314.pyc and b/app/__pycache__/__init__.cpython-314.pyc differ diff --git a/app/__pycache__/factory.cpython-314.pyc b/app/__pycache__/factory.cpython-314.pyc index b32b8b8..9c43984 100644 Binary files a/app/__pycache__/factory.cpython-314.pyc and b/app/__pycache__/factory.cpython-314.pyc differ diff --git a/app/__pycache__/state.cpython-314.pyc b/app/__pycache__/state.cpython-314.pyc index 58d7902..98972be 100644 Binary files a/app/__pycache__/state.cpython-314.pyc and b/app/__pycache__/state.cpython-314.pyc differ diff --git a/core/__pycache__/__init__.cpython-314.pyc b/core/__pycache__/__init__.cpython-314.pyc index 2c6cbb1..9cbd5b2 100644 Binary files a/core/__pycache__/__init__.cpython-314.pyc and b/core/__pycache__/__init__.cpython-314.pyc differ diff --git a/core/__pycache__/config.cpython-314.pyc b/core/__pycache__/config.cpython-314.pyc index 463a9a0..e40be64 100644 Binary files a/core/__pycache__/config.cpython-314.pyc and b/core/__pycache__/config.cpython-314.pyc differ diff --git a/core/__pycache__/logging_config.cpython-314.pyc b/core/__pycache__/logging_config.cpython-314.pyc index 69ccfac..c990bc2 100644 Binary files a/core/__pycache__/logging_config.cpython-314.pyc and b/core/__pycache__/logging_config.cpython-314.pyc differ diff --git a/core/config.py b/core/config.py index 096b381..a20593a 100644 --- a/core/config.py +++ b/core/config.py 
@@ -27,6 +27,11 @@ class Settings: pinned_max_turns: int pinned_max_chars: int pinned_weak_score: float + db_host: str + db_port: int + db_name: str + db_user: str + db_password: str @property def openrouter_headers(self) -> dict[str, str]: @@ -87,6 +92,23 @@ def load(cls) -> Settings: raise RuntimeError("ACL_PINNED_WEAK_SCORE deve ser um número.") from None pinned_weak_score = max(0.05, min(0.95, pinned_weak_score)) + """ !Credenciais do banco! """ + + db_host = (os.getenv("DB_HOST") or "").strip() + + db_port_raw = (os.getenv("DB_PORT") or "3306").strip() + + try: + db_port = int(db_port_raw) + except ValueError: + raise RuntimeError("DB_PORT deve ser um inteiro.") from None + + db_name = (os.getenv("DB_NAME") or "").strip() + + db_user = (os.getenv("DB_USER") or "").strip() + + db_password = (os.getenv("DB_PASSWORD") or "").strip() + return cls( openrouter_api_key=key, project_root=project_root, @@ -100,4 +122,9 @@ def load(cls) -> Settings: pinned_max_turns=pinned_max_turns, pinned_max_chars=pinned_max_chars, pinned_weak_score=pinned_weak_score, + db_host=db_host, + db_port=db_port, + db_name=db_name, + db_user=db_user, + db_password=db_password, ) diff --git a/engine/__pycache__/__init__.cpython-314.pyc b/engine/__pycache__/__init__.cpython-314.pyc index 5418816..ece1cbe 100644 Binary files a/engine/__pycache__/__init__.cpython-314.pyc and b/engine/__pycache__/__init__.cpython-314.pyc differ diff --git a/engine/__pycache__/chat_provider.cpython-314.pyc b/engine/__pycache__/chat_provider.cpython-314.pyc index 554a383..347a60b 100644 Binary files a/engine/__pycache__/chat_provider.cpython-314.pyc and b/engine/__pycache__/chat_provider.cpython-314.pyc differ diff --git a/engine/__pycache__/context.cpython-314.pyc b/engine/__pycache__/context.cpython-314.pyc index 4c471d1..b368c69 100644 Binary files a/engine/__pycache__/context.cpython-314.pyc and b/engine/__pycache__/context.cpython-314.pyc differ diff --git a/engine/__pycache__/database.cpython-314.pyc 
"""MySQL data source for the BM25 index."""
import logging
from typing import TYPE_CHECKING

if TYPE_CHECKING:
    from core.config import Settings

log = logging.getLogger(f"kernelbots.{__name__}")

# Default sliding-window geometry, measured in whitespace-separated words.
DB_CHUNK_WORDS = 500
DB_CHUNK_OVERLAP = 50


def _chunk_text(
    text: str,
    title: str,
    source: str,
    *,
    chunk_words: int = DB_CHUNK_WORDS,
    overlap: int = DB_CHUNK_OVERLAP,
) -> list[dict]:
    """Split *text* into overlapping word windows, each prefixed by *title*.

    Args:
        text: Body to split on whitespace. ``None`` or blank text yields no
            chunks instead of raising, so one bad row cannot abort a load.
        title: Heading placed on the first line of every chunk. ``None`` is
            treated as an empty heading rather than the literal ``"None"``.
        source: Value stored under the ``"source"`` key of every chunk.
        chunk_words: Maximum number of words per window.
        overlap: Number of words shared by two consecutive windows.

    Returns:
        A list of dicts with the keys ``"text"``, ``"source"`` and
        ``"discipline"`` (always ``"db"``), in document order.

    Raises:
        ValueError: If ``chunk_words`` is not positive or ``overlap`` is not
            in ``[0, chunk_words)``; such values would stop the window from
            advancing.
    """
    if chunk_words <= 0:
        raise ValueError("chunk_words must be a positive integer")
    if not 0 <= overlap < chunk_words:
        raise ValueError("overlap must satisfy 0 <= overlap < chunk_words")

    words = (text or "").split()
    if not words:
        return []

    heading = title or ""
    total = len(words)
    chunks: list[dict] = []
    for start in range(0, total, chunk_words - overlap):
        end = min(start + chunk_words, total)
        chunks.append({
            "text": f"{heading}\n" + " ".join(words[start:end]),
            "source": source,
            "discipline": "db",
        })
        # The window already reached the last word; another step would only
        # emit a chunk made entirely of overlap.
        if end == total:
            break
    return chunks


def fetch_db_chunks(settings: "Settings") -> list[dict]:
    """Load active ``knowledge`` rows from MySQL and return them as BM25 chunks.

    This is a best-effort source: every failure path logs and returns ``[]``
    so the caller can keep working without the database.

    Args:
        settings: Object exposing ``db_host``, ``db_port``, ``db_name``,
            ``db_user`` and ``db_password``.

    Returns:
        The chunks produced by ``_chunk_text`` for every row, ordered by row
        id, each tagged with source ``"db:<category>"``. Returns ``[]`` when
        host, database name or user is empty, when PyMySQL is not installed,
        or when connecting, querying or chunking raises.
    """
    # The password is deliberately not required: an empty one is valid.
    if not all((settings.db_host, settings.db_name, settings.db_user)):
        log.debug("Variáveis DB_* não configuradas — pulando fonte MySQL.")
        return []

    # Imported lazily so the rest of the application runs without PyMySQL.
    try:
        import pymysql
        import pymysql.cursors
    except ImportError:
        log.warning("PyMySQL não instalado — fonte MySQL desativada.")
        return []

    try:
        conn = pymysql.connect(
            host=settings.db_host,
            port=settings.db_port,
            database=settings.db_name,
            user=settings.db_user,
            password=settings.db_password,
            charset="utf8mb4",
            cursorclass=pymysql.cursors.DictCursor,
            connect_timeout=5,
            read_timeout=10,
        )
        # NOTE(review): relies on the connection's context manager closing it
        # on exit (PyMySQL >= 1.0 behaviour) -- confirm the installed version.
        with conn:
            with conn.cursor() as cursor:
                cursor.execute(
                    "SELECT id, title, content, category "
                    "FROM knowledge WHERE active = 1 ORDER BY id"
                )
                rows = cursor.fetchall()

        all_chunks: list[dict] = []
        for row in rows:
            source = f"db:{row['category']}"
            chunks = _chunk_text(row["content"], row["title"], source)
            all_chunks.extend(chunks)
            log.debug(" 🗄 row id=%s '%s' → %s chunk(s)", row["id"], row["title"], len(chunks))

        log.info(" 🗄 MySQL: %s row(s) → %s chunk(s) carregados", len(rows), len(all_chunks))
        return all_chunks

    except Exception:
        # Deliberate catch-all at the data-source boundary: the index must
        # still build from the other sources when MySQL is unavailable.
        log.warning("⚠ Falha ao conectar ao MySQL — continuando apenas com .md.", exc_info=True)
        return []
self._global_context_mode: GlobalContextMode = global_context_mode + self._settings = settings self._lock = threading.RLock() self._silos: dict[str, dict[str, Any]] = {} self._discipline_ids: frozenset[str] = frozenset() @@ -151,6 +155,15 @@ def rebuild(self) -> None: log.warning( "⚠ Nenhum .md indexado — BM25 desativado. Modo assistente geral ativo." ) + + # --- chunks do MySQL (silo "db") --- + db_chunks: list[dict] = [] + if self._settings is not None: + db_chunks = fetch_db_chunks(self._settings) + if db_chunks: + tokenized_db = [self._tokenize(c["text"]) for c in db_chunks] + new_silos["db"] = {"chunks": db_chunks, "bm25": BM25Okapi(tokenized_db)} + all_chunks.extend(db_chunks) elapsed = (time.perf_counter() - t0) * 1000 with self._lock: @@ -158,11 +171,11 @@ def rebuild(self) -> None: self._silos = new_silos self._all_chunks = all_chunks + db_count = len(db_chunks) + md_count = len(all_chunks) - db_count log.info( - "✅ Índice BM25 por silo pronto — %s chunk(s) | %s silo(s) | rebuild em %.1fms", - len(all_chunks), - len(new_silos), - elapsed, + "✅ Índice BM25 por silo pronto — %s chunk(s) (%s .md + %s MySQL) | %s silo(s) | rebuild em %.1fms", + len(all_chunks), md_count, db_count, len(new_silos), elapsed, ) def normalize_discipline(self, raw: str | None) -> str | None: @@ -225,7 +238,10 @@ def search( return self._hits_in_silo(nd, query, top_k) if self._global_context_mode == "geral": - return self._hits_in_silo("geral", query, top_k) + hits = self._hits_in_silo("geral", query, top_k) + hits += self._hits_in_silo("db", query, top_k) + hits.sort(key=lambda h: h["score"], reverse=True) + return hits[:top_k] merged: list[dict] = [] for silo in sorted(self._silos.keys()): diff --git a/main.py b/main.py index a8efe52..b45637f 100644 --- a/main.py +++ b/main.py @@ -24,6 +24,7 @@ settings.content_dir, settings.bm25_score_threshold, settings.global_context_mode, + settings=settings, ) observer = start_content_observer(search_engine, settings.content_dir) diff --git 
a/requirements.txt b/requirements.txt index f93aa6e..bbba313 100644 --- a/requirements.txt +++ b/requirements.txt @@ -5,4 +5,5 @@ python-dotenv jinja2 rank-bm25 watchdog -pytest \ No newline at end of file +pytest +PyMySQL \ No newline at end of file diff --git a/templates/index.html b/templates/index.html index 9e63d6c..f7c13e9 100644 --- a/templates/index.html +++ b/templates/index.html @@ -32,6 +32,7 @@ content/doc), /python, /visualizacao-sql, /projeto-bloco, /planejamento-curso-carreira (RAG só na disciplina).

+ /reload /python o que são listas? /visualizacao-sql explique GROUP BY /projeto-bloco resuma o pipeline @@ -48,6 +49,7 @@ Enter envia · Shift+Enter nova linha · /python · /visualizacao-sql · /projeto-bloco · /planejamento-curso-carreira · /doc · /content + · /reload para reconstruir o índice