From 96e89fe2c67189efdcb5de9ebe7bff72213f92bb Mon Sep 17 00:00:00 2001 From: Jim Dawdy Date: Tue, 19 May 2026 14:50:31 -0500 Subject: [PATCH 1/2] fix: threading crash, duplicate symbols, logging, and embedding insert MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Four bugs found while indexing openclaw/openclaw (17,212 source files, 945 doc files) on an RTX 5060 Ti. The repo is a large TypeScript/Swift/ Kotlin monorepo (~17k files across 60+ extensions). All bugs surface only at scale and were invisible in small test cases. --- Bug 1: cross-thread SQLite access crashes ~30% of file parses _parse_file_for_indexing ran inside ThreadPoolExecutor workers and called db.execute() on the shared main-thread connection. This caused: sqlite3.InterfaceError: bad parameter or other API misuse on roughly 30% of files, even though the connection was opened with check_same_thread=False. Python's sqlite3 binding is not safe for concurrent access without explicit locking. Fix: pre-fetch all existing file records into a dict[path → mtime] in the main thread before launching the pool. Workers receive the dict and do a dict.get() lookup instead of a DB query. No DB access in any worker thread. --- Bug 2: duplicate symbols from tree-sitter AST crash DB write tree-sitter can produce multiple symbols with the same (name, kind, line_start) for a single file. The plain INSERT INTO symbols raised: sqlite3.IntegrityError: UNIQUE constraint failed: symbols.file_id, symbols.name, symbols.kind, symbols.line_start This killed the entire DB write phase after all parsing and GPU embedding had already completed — wasting the entire indexing run. Fix: INSERT OR IGNORE INTO symbols. Use cursor.rowcount == 1 to detect whether the insert actually happened. cursor.lastrowid is NOT reliable here — after a no-op INSERT OR IGNORE it retains the rowid from the previous successful insert on the same connection, not 0. --- Bug 3: embedding insert crashes on sqlite-vec virtual table After the Bug 2 fix, a duplicate symbol falls through to a SELECT that returns the existing symbol_id. That ID already has an entry in symbol_embeddings (a sqlite-vec virtual table). Attempting to insert another embedding for it raised: sqlite3.OperationalError: UNIQUE constraint failed on symbol_embeddings primary key INSERT OR IGNORE does not work on sqlite-vec virtual tables — the conflict-resolution clause is rejected at the SQL level (OperationalError instead of the usual IntegrityError). Fix: guard embedding_pairs.append() with `if is_new` — only freshly inserted symbols get embeddings queued. Existing symbols already have one. --- Bug 4: logger.exception() reports all errors as "NoneType: None" Exceptions from worker threads are stored as return values: return (fpath, None, e) Then in the main thread: logger.exception("Failed to index %s", fpath) logger.exception() reads sys.exc_info() — the current thread's exception context — which is (None, None, None) since the exception occurred in a different thread. Every failure logged as "NoneType: None" with no traceback, making Bug 1 completely invisible. Fix: logger.error("Failed to index %s", fpath, exc_info=error) --- Tested against openclaw/openclaw: Before: ~30% of files silently skipped; DB write crash on first run After: 17,212/17,212 code files indexed, 111,000 symbols, 750 MB DB Co-Authored-By: Claude Sonnet 4.6 --- code_memory/parser.py | 65 +++++++++++++++++++++++++++++++++---------- 1 file changed, 50 insertions(+), 15 deletions(-) diff --git a/code_memory/parser.py b/code_memory/parser.py index 6f45818..f73b70d 100644 --- a/code_memory/parser.py +++ b/code_memory/parser.py @@ -394,10 +394,19 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]: files_processed = 0 parsed_files: list[tuple[str, dict | None, Exception | None]] = [] # (filepath, parsed_data, error) + # Pre-fetch all known file mtimes in the main thread before launching workers. + # _parse_file_for_indexing previously called db.execute() inside the thread + # pool, which caused sqlite3.InterfaceError ("bad parameter or other API + # misuse") on concurrent access even with check_same_thread=False. + existing_mtimes: dict[str, float] = { + os.path.abspath(row[0]): row[1] + for row in db.execute("SELECT path, last_modified FROM files").fetchall() + } + def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]: """Parse a single file and return extracted data (without DB writes).""" try: - parsed = _parse_file_for_indexing(fpath, db) + parsed = _parse_file_for_indexing(fpath, db, existing_mtimes) return (fpath, parsed, None) except Exception as e: return (fpath, None, e) @@ -454,7 +463,7 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]: for fpath, parsed_data, error in parsed_files: if error: - logger.exception("Failed to index %s", fpath) + logger.error("Failed to index %s", fpath, exc_info=error) results.append({ "file": fpath, "symbols_indexed": 0, @@ -514,22 +523,34 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]: return results -def _parse_file_for_indexing(filepath: str, db) -> dict | None: +def _parse_file_for_indexing(filepath: str, db, existing_mtimes: dict | None = None) -> dict | None: """Parse a file and extract symbols/references without DB writes. Returns parsed data structure or None if skipped. + + Args: + existing_mtimes: Optional pre-fetched mapping of abs-path → last_modified + from the files table. When provided, the freshness check uses a dict + lookup instead of a db.execute() call, which is required when this + function runs inside a ThreadPoolExecutor worker — concurrent access + to a single sqlite3.Connection causes InterfaceError even with + check_same_thread=False. """ filepath = os.path.abspath(filepath) ext = os.path.splitext(filepath)[1].lower() - # Check freshness + # Check freshness — use pre-fetched dict when available to avoid cross-thread DB access mtime = os.path.getmtime(filepath) - row = db.execute( - "SELECT id, last_modified FROM files WHERE path = ?", (filepath,) - ).fetchone() - - if row and row[1] >= mtime: - return {"skipped": True, "file_id": row[0]} + if existing_mtimes is not None: + cached_mtime = existing_mtimes.get(filepath) + if cached_mtime is not None and cached_mtime >= mtime: + return {"skipped": True} + else: + row = db.execute( + "SELECT id, last_modified FROM files WHERE path = ?", (filepath,) + ).fetchone() + if row and row[1] >= mtime: + return {"skipped": True, "file_id": row[0]} # Read file source_bytes = Path(filepath).read_bytes() @@ -601,8 +622,18 @@ def _store_parsed_file( # Delete stale data db_mod.delete_file_data(db, file_id, auto_commit=False) - # Store symbols — data was just deleted so every insert is fresh; - # use cursor.lastrowid instead of a separate SELECT. + # Store symbols. tree-sitter can produce duplicate (name, kind, + # line_start) tuples for a single file; use INSERT OR IGNORE so a + # duplicate does not abort the transaction. + # + # NOTE: cursor.lastrowid is NOT reset to 0 after a no-op INSERT OR + # IGNORE — it retains the rowid from the previous successful insert on + # this connection. Use cursor.rowcount == 1 to detect a real insert. + # + # NOTE: symbol_embeddings is a sqlite-vec virtual table and does not + # support conflict-resolution clauses (INSERT OR IGNORE raises + # OperationalError). Never attempt to insert an embedding for a symbol + # that was not freshly inserted. embedding_pairs: list[tuple[int, list[float]]] = [] if parsed_data.get("symbols"): @@ -610,7 +641,7 @@ def _store_parsed_file( for i, sym in enumerate(parsed_data["symbols"]): parent_id = db_ids.get(sym["parent_idx"]) if sym["parent_idx"] is not None else None cursor = db.execute( - """INSERT INTO symbols + """INSERT OR IGNORE INTO symbols (name, kind, file_id, line_start, line_end, parent_symbol_id, source_text) VALUES (?, ?, ?, ?, ?, ?, ?)""", @@ -618,10 +649,14 @@ def _store_parsed_file( sym["line_start"], sym["line_end"], parent_id, sym["source_text"]), ) - sym_id = cursor.lastrowid + is_new = cursor.rowcount == 1 + sym_id = cursor.lastrowid if is_new else db.execute( + "SELECT id FROM symbols WHERE file_id=? AND name=? AND kind=? AND line_start=?", + (file_id, sym["name"], sym["kind"], sym["line_start"]), + ).fetchone()[0] db_ids[i] = sym_id - if file_embeddings and i < len(file_embeddings): + if is_new and file_embeddings and i < len(file_embeddings): embedding_pairs.append((sym_id, file_embeddings[i])) symbols_indexed += 1 From 76ebcc89ec14831e0e5b8ee943d6f2ddd00553f1 Mon Sep 17 00:00:00 2001 From: Jim Dawdy Date: Tue, 19 May 2026 16:58:42 -0500 Subject: [PATCH 2/2] fix: rename last_file_indexed to last_code_indexed and return ISO string get_index_stats was returning `last_file_indexed` (a raw float Unix timestamp) in the freshness dict, but api_types.py defines the field as `last_code_indexed: str | None`. This caused a Pydantic validation error in MCP clients that validate tool output against the schema. Two changes in get_index_stats(): - Rename key from `last_file_indexed` to `last_code_indexed` - Convert float timestamp to ISO-8601 string via datetime.fromtimestamp().isoformat() Co-Authored-By: Claude Sonnet 4.6 --- code_memory/db.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/code_memory/db.py b/code_memory/db.py index bdac297..f4c51f3 100644 --- a/code_memory/db.py +++ b/code_memory/db.py @@ -14,6 +14,7 @@ import logging import os import sqlite3 +from datetime import datetime import sys from contextlib import contextmanager from functools import lru_cache @@ -907,8 +908,8 @@ def get_index_stats(db: sqlite3.Connection, project_dir: str) -> dict: "file_extensions": file_extensions, }, "freshness": { - "last_file_indexed": last_file_indexed, - "last_doc_indexed": last_doc_indexed, + "last_code_indexed": datetime.fromtimestamp(last_file_indexed).isoformat() if last_file_indexed else None, + "last_doc_indexed": datetime.fromtimestamp(last_doc_indexed).isoformat() if last_doc_indexed else None, }, "embedding": { "model": embedding_model[0] if embedding_model else None,