Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 8 additions & 0 deletions Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,14 @@ RUN --mount=type=cache,target=/root/.cache/uv \
uv sync --frozen --no-dev --extra ollama 2>/dev/null || \
uv sync --no-dev --extra ollama

# Build-time smoke test: load the sqlite-vec (vec0) extension into an
# in-memory SQLite database and print its version, so a non-loadable
# extension fails the image build instead of surfacing at runtime.
RUN /app/.venv/bin/python -c "\
import sqlite3, sqlite_vec; \
c = sqlite3.connect(':memory:'); \
c.enable_load_extension(True); \
sqlite_vec.load(c); \
v = c.execute('SELECT vec_version()').fetchone()[0]; \
print(f'vec0 OK, version={v}')"


FROM python:3.12-slim AS runtime

Expand Down
1 change: 1 addition & 0 deletions docs/benchmarks/concurrency.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,5 @@
#!/usr/bin/env python3
# ruff: noqa: I001, E501
from __future__ import annotations

import argparse
Expand Down
29 changes: 25 additions & 4 deletions docs/benchmarks/hit3_recall.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,9 +9,12 @@

Dependency: stdlib only.
"""
# ruff: noqa: E501, S310
from __future__ import annotations

import json
import math
import time
import urllib.request

BASE_URL = "http://localhost:9100"
Expand All @@ -32,7 +35,7 @@
]


def search(query: str, mode: str = "hybrid", k: int = 3) -> list[str]:
def search(query: str, mode: str = "hybrid", k: int = 3) -> tuple[list[str], float]:
body = json.dumps({
"query": query,
"limit": k,
Expand All @@ -45,27 +48,45 @@ def search(query: str, mode: str = "hybrid", k: int = 3) -> list[str]:
headers={"Content-Type": "application/json"},
method="POST",
)
started = time.perf_counter()
with urllib.request.urlopen(req, timeout=30) as resp:
d = json.loads(resp.read())
return [r["entry"]["entry_id"] for r in d.get("results", [])]
elapsed_ms = (time.perf_counter() - started) * 1000.0
return [r["entry"]["entry_id"] for r in d.get("results", [])], elapsed_ms


def bench(mode: str) -> float:
    """Run the Hit@3 benchmark for one search *mode* and return the hit rate.

    For every ground-truth pair in ``PAIRS``, issues a top-3 search, records
    per-request latency, and prints one line per query plus a summary with
    the Hit@3 score and p50/p95/p99 latencies.

    Returns:
        Fraction of queries whose expected entry_id appeared in the top 3.
    """
    # NOTE: this body merges the post-change side of the pasted diff; the
    # scrape interleaved the pre-change call/print lines, which are dropped.
    hits = 0
    latencies_ms: list[float] = []
    print(f"\n=== mode={mode} ===")
    for p in PAIRS:
        top3, elapsed_ms = search(p["q"], mode=mode, k=3)
        latencies_ms.append(elapsed_ms)
        hit = p["expect"] in top3
        if hit:
            hits += 1
        # 1-based rank when found; the literal "miss" otherwise.
        pos = top3.index(p["expect"]) + 1 if hit else "miss"
        mark = "✓" if hit else "✗"
        print(f" [{mark}] pos={str(pos):>4} | q={p['q']!r:40} | {p['note']}")
    score = hits / len(PAIRS)
    p50 = _percentile(latencies_ms, 50)
    p95 = _percentile(latencies_ms, 95)
    p99 = _percentile(latencies_ms, 99)
    print(
        f"Hit@3 ({mode}): {hits}/{len(PAIRS)} = {score * 100:.0f}%"
        f" | latency p50/p95/p99 = {p50:.1f}/{p95:.1f}/{p99:.1f} ms"
    )
    return score


def _percentile(samples: list[float], percentile: int) -> float:
if not samples:
return 0.0
ordered = sorted(samples)
index = max(0, math.ceil((percentile / 100) * len(ordered)) - 1)
return ordered[index]


if __name__ == "__main__":
    # Benchmark every retrieval mode in a fixed order.
    modes = ("hybrid", "semantic", "lexical")
    for current_mode in modes:
        bench(current_mode)
1 change: 1 addition & 0 deletions docs/benchmarks/race.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
- Exactly 1 response has created=True, 9 have created=False
- No HTTP 500, no unique-constraint error
"""
# ruff: noqa: S310
from __future__ import annotations

import json
Expand Down
105 changes: 105 additions & 0 deletions docs/benchmarks/results-2026-04-19.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,105 @@
# v0.2 jieba benchmark — 2026-04-19

Same Hit@3 methodology as `docs/benchmarks/hit3_recall.py`, but rerun against the repo-local hall in `.data/` because the 2026-04-18 primary corpus (177 entries on mini primary) is not available inside this workspace sandbox.

## Environment

- Host: local repo workspace on macOS
- App: in-process ASGI app via `httpx.ASGITransport`
- Storage: `.data/memory-hall.sqlite3`
- Vector store: `.data/memory-hall-vectors.sqlite3`
- Hall size at test time: 6 entries
- FTS migration step before benchmark: `UV_CACHE_DIR=/tmp/uv-cache uv run --no-sync mh reindex-fts --database-path .data/memory-hall.sqlite3`

## Corpus caveat

- This is **not** the same 177-entry primary hall from `results-2026-04-18.md`, so the raw percentage is not apples-to-apples with the frozen v0.1 baseline.
- What this run does verify: the new pre-tokenized FTS path now hits short pure-CJK lexical queries like `撞牆` on an existing hall after an FTS-only rebuild.

## Ground-truth pairs used in this rerun

| Query | Expected entry_id | Note |
|---|---|---|
| `撞牆` | `01KPG8QXEWH12WETTRG5ZX09JR` | pure CJK short substring |
| `Dockerfile clone 即跑` | `01KPG8QXEWH12WETTRG5ZX09JR` | mixed CJK + English |
| `sticky note` | `01KPG8RVW0EX40C92106YCN6AQ` | English phrase in mixed entry |
| `筆記本 view` | `01KPG8RVW0EX40C92106YCN6AQ` | mixed CJK + English phrase |
| `桌面抽屜` | `01KPG96JV68YJ1H0Y2DE78ESDC` | pure CJK phrase |
| `找回來 列表` | `01KPG96JV68YJ1H0Y2DE78ESDC` | pure CJK paraphrase |
| `說了就記住` | `01KPG9SSYCNMJJZFEAAYP3H3M8` | pure CJK phrase |
| `30 秒 很煩人` | `01KPG9SSYCNMJJZFEAAYP3H3M8` | mixed CJK + numeric |
| `hybrid` | `01KPG9Y7P3J2GNN7WGH6XRD6GD` | English keyword in mixed entry |
| `留下足跡` | `01KPG9Y7P3J2GNN7WGH6XRD6GD` | pure CJK phrase |

## Hit@3

| Mode | Hit@3 | Latency p50 / p95 / p99 |
|---|---|---|
| hybrid | **10/10 = 100%** | **5.1 / 317.6 / 317.6 ms** |
| lexical | **10/10 = 100%** | **1.7 / 2.1 / 2.1 ms** |
| semantic | **0/10 = 0%** | **3.9 / 5.2 / 5.2 ms** |

## Raw run output

```text
=== mode=hybrid ===
[✓] pos= 1 | q='撞牆' | pure CJK short substring
[✓] pos= 1 | q='Dockerfile clone 即跑' | mixed CJK + English
[✓] pos= 1 | q='sticky note' | English phrase in mixed entry
[✓] pos= 1 | q='筆記本 view' | mixed CJK + English phrase
[✓] pos= 1 | q='桌面抽屜' | pure CJK phrase
[✓] pos= 1 | q='找回來 列表' | pure CJK paraphrase
[✓] pos= 1 | q='說了就記住' | pure CJK phrase
[✓] pos= 1 | q='30 秒 很煩人' | mixed CJK + numeric
[✓] pos= 1 | q='hybrid' | English keyword in mixed entry
[✓] pos= 1 | q='留下足跡' | pure CJK phrase
Hit@3 (hybrid): 10/10 = 100% | latency p50/p95/p99 = 5.1/317.6/317.6 ms

=== mode=semantic ===
[✗] pos=miss | q='撞牆' | pure CJK short substring
[✗] pos=miss | q='Dockerfile clone 即跑' | mixed CJK + English
[✗] pos=miss | q='sticky note' | English phrase in mixed entry
[✗] pos=miss | q='筆記本 view' | mixed CJK + English phrase
[✗] pos=miss | q='桌面抽屜' | pure CJK phrase
[✗] pos=miss | q='找回來 列表' | pure CJK paraphrase
[✗] pos=miss | q='說了就記住' | pure CJK phrase
[✗] pos=miss | q='30 秒 很煩人' | mixed CJK + numeric
[✗] pos=miss | q='hybrid' | English keyword in mixed entry
[✗] pos=miss | q='留下足跡' | pure CJK phrase
Hit@3 (semantic): 0/10 = 0% | latency p50/p95/p99 = 3.9/5.2/5.2 ms

=== mode=lexical ===
[✓] pos= 1 | q='撞牆' | pure CJK short substring
[✓] pos= 1 | q='Dockerfile clone 即跑' | mixed CJK + English
[✓] pos= 1 | q='sticky note' | English phrase in mixed entry
[✓] pos= 1 | q='筆記本 view' | mixed CJK + English phrase
[✓] pos= 1 | q='桌面抽屜' | pure CJK phrase
[✓] pos= 1 | q='找回來 列表' | pure CJK paraphrase
[✓] pos= 1 | q='說了就記住' | pure CJK phrase
[✓] pos= 1 | q='30 秒 很煩人' | mixed CJK + numeric
[✓] pos= 1 | q='hybrid' | English keyword in mixed entry
[✓] pos= 1 | q='留下足跡' | pure CJK phrase
Hit@3 (lexical): 10/10 = 100% | latency p50/p95/p99 = 1.7/2.1/2.1 ms
```

## Interpretation

- The lexical failure mode called out in `results-2026-04-18.md` is gone on this hall: short pure-CJK query `撞牆` now resolves through FTS after rebuild.
- Hybrid is still effectively lexical on this tiny corpus; semantic-only remains unhelpful for short queries.
- The new latency columns make the cold-path cost visible: hybrid p95/p99 are dominated by the first run's `jieba` dictionary load, while steady-state lexical calls stay near 2 ms on this corpus.
- A true acceptance rerun against the 2026-04-18 primary corpus still needs to happen on the target hall to confirm the `>= 75%` gate on the original workload.

## Cleanup follow-up measurements

### A. jieba lazy load

- `env UV_CACHE_DIR=/tmp/uv-cache uv run --no-sync mh --help`
- before function-local import: `real 0.33s`
- after function-local import: `real 0.25s`
- qualitative result: `mh --help` no longer imports `jieba` on cold start, so the previous `pkg_resources` warning also disappears from this path.

### C. `mh reindex-fts` cursor streaming

- synthetic CLI verification: `2000` entries streamed with `batch_size=500`
- observed batch shape: `500 + 500 + 500 + 500` (then one empty fetch to terminate)
- regression guard: `tests/test_cli_reindex.py` asserts `_reindex_fts()` never falls back to `limit=None`
5 changes: 3 additions & 2 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -26,15 +26,16 @@ dependencies = [
"pydantic>=2.9",
"pydantic-settings>=2.6",
"httpx>=0.27",
"sqlite-vec==0.1.6",
"jieba>=0.42.1",
"sqlite-vec==0.1.9",
"typer>=0.13",
"rich>=13.9",
"ulid-py>=1.1",
"python-multipart>=0.0.12",
]

[project.optional-dependencies]
sqlite-vec = ["sqlite-vec>=0.1.6"]
sqlite-vec = ["sqlite-vec==0.1.9"]
qdrant = ["qdrant-client>=1.12"]
ollama = ["ollama>=0.4"]
openai = ["openai>=1.50"]
Expand Down
53 changes: 52 additions & 1 deletion src/memory_hall/cli/main.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,7 @@

from __future__ import annotations

import asyncio
import json
import time
from pathlib import Path
Expand All @@ -15,7 +16,9 @@
from rich.table import Table

from memory_hall.config import Settings
from memory_hall.models import encode_cursor
from memory_hall.server.app import create_app
from memory_hall.storage.sqlite_store import SqliteStore

app = typer.Typer(no_args_is_help=True, add_completion=False)
console = Console()
Expand Down Expand Up @@ -143,7 +146,7 @@ def get(

@app.command()
def tail(
limit: int = typer.Option(default=20, min=1, max=200),
limit: int = typer.Option(default=20, min=1, max=1000),
interval_s: float = typer.Option(default=2.0, min=0.2),
namespace: list[str] | None = typer.Option(default=None),
agent_id: str | None = typer.Option(default=None),
Expand Down Expand Up @@ -171,3 +174,51 @@ def tail(
seen.add(item["entry_id"])
console.print(f"[{item['created_at']}] {item['entry_id']} {item['content']}")
time.sleep(interval_s)


@app.command("reindex-fts")
def reindex_fts(
tenant_id: str | None = typer.Option(default=None),
batch_size: int = typer.Option(default=500, min=1, max=5000),
database_path: Path | None = typer.Option(default=None),
) -> None:
asyncio.run(
_reindex_fts(
tenant_id=tenant_id,
batch_size=batch_size,
database_path=database_path,
)
)


async def _reindex_fts(
    *,
    tenant_id: str | None,
    batch_size: int,
    database_path: Path | None,
) -> None:
    """Rebuild FTS rows for one tenant by streaming entries in cursor pages.

    Pages through ``store.list_entries`` with an opaque cursor until an
    empty page comes back, re-indexing each batch and printing a one-line
    summary.  The store is always closed, even if a batch fails.
    """
    settings = _settings()
    if database_path is not None:
        settings.database_path = database_path
    active_tenant_id = tenant_id or settings.default_tenant_id
    store = SqliteStore(settings.database_path)
    await store.open()
    try:
        scanned = 0
        reindexed = 0
        cursor: str | None = None
        # Keep fetching pages until the store returns an empty batch.
        while batch := await store.list_entries(
            active_tenant_id,
            limit=batch_size,
            cursor=cursor,
        ):
            scanned += len(batch)
            reindexed += await store.reindex_fts_entries(batch)
            last_entry = batch[-1]
            cursor = encode_cursor(last_entry.created_at, last_entry.entry_id)
        console.print(f"tenant={active_tenant_id} scanned={scanned} reindexed={reindexed}")
    finally:
        await store.close()
Loading
Loading