From 96e89fe2c67189efdcb5de9ebe7bff72213f92bb Mon Sep 17 00:00:00 2001
From: Jim Dawdy <jim.dawdy@gmail.com>
Date: Tue, 19 May 2026 14:50:31 -0500
Subject: [PATCH 1/2] fix: threading crash, duplicate symbols, logging, and
 embedding insert
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Four bugs found while indexing openclaw/openclaw (17,212 source files,
945 doc files) on an RTX 5060 Ti. The repo is a large TypeScript/Swift/
Kotlin monorepo (~17k files across 60+ extensions). All bugs surface only
at scale and were invisible in small test cases.

---

Bug 1: cross-thread SQLite access crashes ~30% of file parses

_parse_file_for_indexing ran inside ThreadPoolExecutor workers and called
db.execute() on the shared main-thread connection. This caused:

  sqlite3.InterfaceError: bad parameter or other API misuse

on roughly 30% of files, even though the connection was opened with
check_same_thread=False. Python's sqlite3 binding is not safe for
concurrent access without explicit locking.

Fix: pre-fetch all existing file records into a dict[path → mtime] in the
main thread before launching the pool. Workers receive the dict and do a
dict.get() lookup instead of a DB query. No DB access in any worker thread.

---

Bug 2: duplicate symbols from tree-sitter AST crash DB write

tree-sitter can produce multiple symbols with the same (name, kind,
line_start) for a single file. The plain INSERT INTO symbols raised:

  sqlite3.IntegrityError: UNIQUE constraint failed:
    symbols.file_id, symbols.name, symbols.kind, symbols.line_start

This killed the entire DB write phase after all parsing and GPU embedding
had already completed — wasting the entire indexing run.

Fix: INSERT OR IGNORE INTO symbols. Use cursor.rowcount == 1 to detect
whether the insert actually happened. cursor.lastrowid is NOT reliable
here — after a no-op INSERT OR IGNORE it retains the rowid from the
previous successful insert on the same connection, not 0.

---

Bug 3: embedding insert crashes on sqlite-vec virtual table

After the Bug 2 fix, a duplicate symbol falls through to a SELECT that
returns the existing symbol_id. That ID already has an entry in
symbol_embeddings (a sqlite-vec virtual table). Attempting to insert
another embedding for it raised:

  sqlite3.OperationalError: UNIQUE constraint failed on
    symbol_embeddings primary key

INSERT OR IGNORE does not work on sqlite-vec virtual tables — the
conflict-resolution clause is rejected at the SQL level (OperationalError
instead of the usual IntegrityError).

Fix: guard embedding_pairs.append() with `if is_new` — only freshly
inserted symbols get embeddings queued. Existing symbols already have one.

---

Bug 4: logger.exception() reports all errors as "NoneType: None"

Exceptions from worker threads are stored as return values:
  return (fpath, None, e)

Then in the main thread:
  logger.exception("Failed to index %s", fpath)

logger.exception() reads sys.exc_info() — the current thread's exception
context — which is (None, None, None) since the exception occurred in a
different thread. Every failure logged as "NoneType: None" with no
traceback, making Bug 1 completely invisible.

Fix: logger.error("Failed to index %s", fpath, exc_info=error)

---

Tested against openclaw/openclaw:
  Before: ~30% of files silently skipped; DB write crash on first run
  After:  17,212/17,212 code files indexed, 111,000 symbols, 750 MB DB

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 code_memory/parser.py | 65 +++++++++++++++++++++++++++++++++----------
 1 file changed, 50 insertions(+), 15 deletions(-)

diff --git a/code_memory/parser.py b/code_memory/parser.py
index 6f45818..f73b70d 100644
--- a/code_memory/parser.py
+++ b/code_memory/parser.py
@@ -394,10 +394,19 @@ def index_directory(dirpath: str, db, progress_callback=None) -> list[dict]:
     files_processed = 0
     parsed_files: list[tuple[str, dict | None, Exception | None]] = []  # (filepath, parsed_data, error)
 
+    # Pre-fetch all known file mtimes in the main thread before launching workers.
+    # _parse_file_for_indexing previously called db.execute() inside the thread
+    # pool, which caused sqlite3.InterfaceError ("bad parameter or other API
+    # misuse") on concurrent access even with check_same_thread=False.
+    existing_mtimes: dict[str, float] = {
+        os.path.abspath(row[0]): row[1]
+        for row in db.execute("SELECT path, last_modified FROM files").fetchall()
+    }
+
     def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
         """Parse a single file and return extracted data (without DB writes)."""
         try:
-            parsed = _parse_file_for_indexing(fpath, db)
+            parsed = _parse_file_for_indexing(fpath, db, existing_mtimes)
             return (fpath, parsed, None)
         except Exception as e:
             return (fpath, None, e)
@@ -454,7 +463,7 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
 
     for fpath, parsed_data, error in parsed_files:
         if error:
-            logger.exception("Failed to index %s", fpath)
+            logger.error("Failed to index %s", fpath, exc_info=error)
             results.append({
                 "file": fpath,
                 "symbols_indexed": 0,
@@ -514,22 +523,34 @@ def _parse_file_task(fpath: str) -> tuple[str, dict | None, Exception | None]:
     return results
 
 
-def _parse_file_for_indexing(filepath: str, db) -> dict | None:
+def _parse_file_for_indexing(filepath: str, db, existing_mtimes: dict | None = None) -> dict | None:
     """Parse a file and extract symbols/references without DB writes.
 
     Returns parsed data structure or None if skipped.
+
+    Args:
+        existing_mtimes: Optional pre-fetched mapping of abs-path → last_modified
+            from the files table. When provided, the freshness check uses a dict
+            lookup instead of a db.execute() call, which is required when this
+            function runs inside a ThreadPoolExecutor worker — concurrent access
+            to a single sqlite3.Connection causes InterfaceError even with
+            check_same_thread=False.
     """
     filepath = os.path.abspath(filepath)
     ext = os.path.splitext(filepath)[1].lower()
 
-    # Check freshness
+    # Check freshness — use pre-fetched dict when available to avoid cross-thread DB access
     mtime = os.path.getmtime(filepath)
-    row = db.execute(
-        "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
-    ).fetchone()
-
-    if row and row[1] >= mtime:
-        return {"skipped": True, "file_id": row[0]}
+    if existing_mtimes is not None:
+        cached_mtime = existing_mtimes.get(filepath)
+        if cached_mtime is not None and cached_mtime >= mtime:
+            return {"skipped": True}
+    else:
+        row = db.execute(
+            "SELECT id, last_modified FROM files WHERE path = ?", (filepath,)
+        ).fetchone()
+        if row and row[1] >= mtime:
+            return {"skipped": True, "file_id": row[0]}
 
     # Read file
     source_bytes = Path(filepath).read_bytes()
@@ -601,8 +622,18 @@ def _store_parsed_file(
         # Delete stale data
         db_mod.delete_file_data(db, file_id, auto_commit=False)
 
-        # Store symbols — data was just deleted so every insert is fresh;
-        # use cursor.lastrowid instead of a separate SELECT.
+        # Store symbols. tree-sitter can produce duplicate (name, kind,
+        # line_start) tuples for a single file; use INSERT OR IGNORE so a
+        # duplicate does not abort the transaction.
+        #
+        # NOTE: cursor.lastrowid is NOT reset to 0 after a no-op INSERT OR
+        # IGNORE — it retains the rowid from the previous successful insert on
+        # this connection. Use cursor.rowcount == 1 to detect a real insert.
+        #
+        # NOTE: symbol_embeddings is a sqlite-vec virtual table and does not
+        # support conflict-resolution clauses (INSERT OR IGNORE raises
+        # OperationalError). Never attempt to insert an embedding for a symbol
+        # that was not freshly inserted.
         embedding_pairs: list[tuple[int, list[float]]] = []
 
         if parsed_data.get("symbols"):
@@ -610,7 +641,7 @@ def _store_parsed_file(
             for i, sym in enumerate(parsed_data["symbols"]):
                 parent_id = db_ids.get(sym["parent_idx"]) if sym["parent_idx"] is not None else None
                 cursor = db.execute(
-                    """INSERT INTO symbols
+                    """INSERT OR IGNORE INTO symbols
                            (name, kind, file_id, line_start, line_end,
                             parent_symbol_id, source_text)
                        VALUES (?, ?, ?, ?, ?, ?, ?)""",
@@ -618,10 +649,14 @@ def _store_parsed_file(
                      sym["line_start"], sym["line_end"],
                      parent_id, sym["source_text"]),
                 )
-                sym_id = cursor.lastrowid
+                is_new = cursor.rowcount == 1
+                sym_id = cursor.lastrowid if is_new else db.execute(
+                    "SELECT id FROM symbols WHERE file_id=? AND name=? AND kind=? AND line_start=?",
+                    (file_id, sym["name"], sym["kind"], sym["line_start"]),
+                ).fetchone()[0]
                 db_ids[i] = sym_id
 
-                if file_embeddings and i < len(file_embeddings):
+                if is_new and file_embeddings and i < len(file_embeddings):
                     embedding_pairs.append((sym_id, file_embeddings[i]))
                 symbols_indexed += 1
 

From 76ebcc89ec14831e0e5b8ee943d6f2ddd00553f1 Mon Sep 17 00:00:00 2001
From: Jim Dawdy <jim.dawdy@gmail.com>
Date: Tue, 19 May 2026 16:58:42 -0500
Subject: [PATCH 2/2] fix: rename last_file_indexed to last_code_indexed and
 return ISO string

get_index_stats was returning `last_file_indexed` (a raw float Unix
timestamp) in the freshness dict, but api_types.py defines the field
as `last_code_indexed: str | None`. This caused a Pydantic validation
error in MCP clients that validate tool output against the schema.

Two changes in get_index_stats():
- Rename key from `last_file_indexed` to `last_code_indexed`
- Convert float timestamp to ISO-8601 string via datetime.fromtimestamp().isoformat()

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 code_memory/db.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/code_memory/db.py b/code_memory/db.py
index bdac297..f4c51f3 100644
--- a/code_memory/db.py
+++ b/code_memory/db.py
@@ -14,6 +14,7 @@
 import logging
 import os
 import sqlite3
+from datetime import datetime
 import sys
 from contextlib import contextmanager
 from functools import lru_cache
@@ -907,8 +908,8 @@ def get_index_stats(db: sqlite3.Connection, project_dir: str) -> dict:
             "file_extensions": file_extensions,
         },
         "freshness": {
-            "last_file_indexed": last_file_indexed,
-            "last_doc_indexed": last_doc_indexed,
+            "last_code_indexed": datetime.fromtimestamp(last_file_indexed).isoformat() if last_file_indexed else None,
+            "last_doc_indexed": datetime.fromtimestamp(last_doc_indexed).isoformat() if last_doc_indexed else None,
         },
         "embedding": {
             "model": embedding_model[0] if embedding_model else None,