diff --git a/src/cache.py b/src/cache.py
index e9d6576..7e3d119 100644
--- a/src/cache.py
+++ b/src/cache.py
@@ -95,11 +95,30 @@ def merge_memory(existing: List[Dict], new: List[Dict]) -> List[Dict]:
 
     Supports both old format (entry_id key) and new format (id key based on
     content hash of source_tool:source_file:content).
+
+    The fallback key is a SHA-256 of the entry content — never str(id(e)) which
+    changes every Python process invocation and would cause duplication (#36).
     """
+    import hashlib
+
+    def _stable_fallback(e: Dict) -> str:
+        """Deterministic key for entries that lack an explicit id field."""
+        # Use content + source fields to build a stable hash so that the
+        # same entry loaded from disk on two different runs gets the same key.
+        raw = "|".join(
+            [
+                str(e.get("content", "")),
+                str(e.get("source_tool", "")),
+                str(e.get("source_file", "")),
+                str(e.get("category", "")),
+            ]
+        )
+        return "fallback:" + hashlib.sha256(raw.encode()).hexdigest()[:16]
 
     def _key(e: Dict) -> str:
-        # New format uses "id" (content-hash), old format uses "entry_id"
-        return e.get("id") or e.get("entry_id") or str(id(e))
+        # New format uses "id" (content-hash), old format uses "entry_id".
+        # Fallback to a stable content hash — never str(id(e)) (#36).
+        return e.get("id") or e.get("entry_id") or _stable_fallback(e)
 
     index = {_key(e): e for e in existing}
     for e in new:
diff --git a/src/memory.py b/src/memory.py
index d8205d2..ca67b32 100644
--- a/src/memory.py
+++ b/src/memory.py
@@ -21,6 +21,11 @@ def _is_raw_file_entry(entry: dict) -> bool:
     return "source_file" in entry
 
 
+def _is_manual_entry(entry: dict) -> bool:
+    """Detect manually-added entries (source_tool == 'manual')."""
+    return entry.get("source_tool") == "manual"
+
+
 @click.group()
 def memory():
     """Manage AI memory entries (local cache)."""
@@ -39,15 +44,20 @@ def memory():
 )
 def add(text, category):
     """Add a memory entry to local cache. Usage: apc memory add "your text" """
-    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
-    short_hash = hashlib.md5(text.encode()).hexdigest()[:6]
-    entry_id = f"{ts}_{short_hash}"
+    # Use the new schema (id + source_tool) so that:
+    #   1. The same text added twice is idempotent — content-hash id is stable (#45)
+    #   2. merge_memory deduplicates via 'id', not a timestamp-based entry_id
+    content_id = hashlib.sha256(f"manual:{category}:{text}".encode()).hexdigest()[:16]
+    now = datetime.now(timezone.utc).isoformat()
 
     new_entry = {
-        "entry_id": entry_id,
+        "id": content_id,
+        "source_tool": "manual",
+        "source_file": "memory_add",
+        "label": f"Manual [{category}]",
         "category": category,
         "content": text,
-        "source": "manual_add",
+        "collected_at": now,
     }
 
     existing = load_memory()
@@ -65,11 +75,26 @@ def list_entries():
         click.echo("No memory entries found. Run 'apc collect' or 'apc memory add \"...\"' first.")
         return
 
-    # Separate raw-file entries from legacy entries
-    raw_files = [e for e in entries if _is_raw_file_entry(e)]
+    # Separate entries into three groups:
+    #   1. manually-added (new schema, source_tool == "manual") — show content
+    #   2. raw collected files (new schema, source_tool != "manual") — show path + size
+    #   3. legacy format (no source_file key) — show content grouped by category
+    manual = [e for e in entries if _is_raw_file_entry(e) and _is_manual_entry(e)]
+    raw_files = [e for e in entries if _is_raw_file_entry(e) and not _is_manual_entry(e)]
     legacy = [e for e in entries if not _is_raw_file_entry(e)]
 
-    # Display raw-file entries
+    # Display manually-added entries grouped by category
+    if manual:
+        by_category: dict = {}
+        for entry in manual:
+            cat = entry.get("category", "manual")
+            by_category.setdefault(cat, []).append(entry)
+        for cat, items in sorted(by_category.items()):
+            click.echo(f"\n[{cat}]")
+            for item in items:
+                click.echo(f"  - {item.get('content', '')} (manual)")
+
+    # Display collected file entries (tool memory files)
     if raw_files:
         click.echo("\n[Collected Files]")
         for entry in raw_files:
@@ -84,12 +109,12 @@
 
     # Display legacy entries grouped by category
     if legacy:
-        by_category = {}
+        by_category2: dict = {}
         for entry in legacy:
             cat = entry.get("category", "unknown")
-            by_category.setdefault(cat, []).append(entry)
+            by_category2.setdefault(cat, []).append(entry)
 
-        for cat, items in sorted(by_category.items()):
+        for cat, items in sorted(by_category2.items()):
             click.echo(f"\n[{cat}]")
             for item in items:
                 source = item.get("source", "")
diff --git a/tests/test_memory_bugs_36_45.py b/tests/test_memory_bugs_36_45.py
new file mode 100644
index 0000000..107f2c9
--- /dev/null
+++ b/tests/test_memory_bugs_36_45.py
@@ -0,0 +1,156 @@
+"""Tests for memory schema and deduplication fixes (#36, #45)."""
+
+import unittest
+from unittest.mock import patch
+
+
+class TestMergeMemoryDedup(unittest.TestCase):
+    """#36 — Memory entries duplicate on every apc collect."""
+
+    def test_same_content_no_id_not_duplicated(self):
+        """Entries lacking id/entry_id are deduplicated by stable content hash."""
+        from cache import merge_memory
+
+        entry = {"content": "hello", "source_tool": "test", "category": "pref"}
+        # Same logical entry as two separate dict instances (simulates load from disk)
+        entry1 = dict(entry)
+        entry2 = dict(entry)
+
+        result = merge_memory([entry1], [entry2])
+        self.assertEqual(len(result), 1, "Same content must not create a duplicate")
+
+    def test_different_content_creates_two_entries(self):
+        """Different content creates two separate entries."""
+        from cache import merge_memory
+
+        e1 = {"content": "hello", "source_tool": "test", "category": "pref"}
+        e2 = {"content": "world", "source_tool": "test", "category": "pref"}
+        result = merge_memory([e1], [e2])
+        self.assertEqual(len(result), 2)
+
+    def test_stable_id_deduplicates_correctly(self):
+        """Entries with an explicit 'id' are deduplicated by that id."""
+        from cache import merge_memory
+
+        e1 = {"id": "abc123", "content": "old", "source_tool": "t"}
+        e2 = {"id": "abc123", "content": "new", "source_tool": "t"}
+        result = merge_memory([e1], [e2])
+        self.assertEqual(len(result), 1)
+        self.assertEqual(result[0]["content"], "new", "New entry should win (upsert)")
+
+    def test_entry_id_key_stable_across_runs(self):
+        """entry_id (old format) provides a stable dedup key across reloads."""
+        from cache import merge_memory
+
+        e = {"entry_id": "20260307_abc123", "content": "hello", "category": "pref"}
+        # Same entry reloaded = new Python object, but same entry_id
+        e_reloaded = dict(e)
+        result = merge_memory([e], [e_reloaded])
+        self.assertEqual(len(result), 1)
+
+    def test_no_id_same_source_tool_file_deduplicates(self):
+        """Fallback hash uses source_tool + source_file + content for stable keying."""
+        from cache import merge_memory
+
+        base = {"content": "data", "source_tool": "openclaw", "source_file": "MEMORY.md"}
+        result = merge_memory([dict(base)], [dict(base)])
+        self.assertEqual(len(result), 1, "Same source+content should deduplicate")
+
+
+class TestMemoryAddNewSchema(unittest.TestCase):
+    """#45 — apc memory add uses legacy schema vs collect's new schema."""
+
+    def test_memory_add_uses_id_not_entry_id(self):
+        """memory add now creates entries with 'id' field (new schema)."""
+        from click.testing import CliRunner
+
+        from memory import memory
+
+        runner = CliRunner()
+        saved = []
+
+        with (
+            patch("memory.load_memory", return_value=[]),
+            patch("memory.save_memory", side_effect=lambda entries: saved.extend(entries)),
+            patch("memory.merge_memory", side_effect=lambda existing, new: new),
+        ):
+            result = runner.invoke(memory, ["add", "test memory text"])
+
+        self.assertEqual(result.exit_code, 0)
+        self.assertEqual(len(saved), 1)
+        entry = saved[0]
+        # New schema: must have 'id', not 'entry_id'
+        self.assertIn("id", entry, "New schema requires 'id' field")
+        self.assertNotIn("entry_id", entry, "'entry_id' is legacy — must not appear")
+        self.assertIn("source_tool", entry)
+        self.assertEqual(entry["source_tool"], "manual")
+        self.assertIn("source_file", entry)
+
+    def test_memory_add_same_text_same_id(self):
+        """Adding the same text twice produces the same id (idempotent)."""
+        from click.testing import CliRunner
+
+        from memory import memory
+
+        runner = CliRunner()
+        ids = []
+
+        def capture_save(entries):
+            ids.extend(e["id"] for e in entries)
+
+        with (
+            patch("memory.load_memory", return_value=[]),
+            patch("memory.save_memory", side_effect=capture_save),
+            patch("memory.merge_memory", side_effect=lambda existing, new: new),
+        ):
+            runner.invoke(memory, ["add", "repeated text"])
+            runner.invoke(memory, ["add", "repeated text"])
+
+        self.assertEqual(len(ids), 2)
+        self.assertEqual(ids[0], ids[1], "Same text must produce the same id")
+
+    def test_memory_add_different_category_different_id(self):
+        """Same text with different category produces a different id."""
+        from click.testing import CliRunner
+
+        from memory import memory
+
+        runner = CliRunner()
+        ids = []
+
+        def capture_save(entries):
+            ids.extend(e["id"] for e in entries)
+
+        with (
+            patch("memory.load_memory", return_value=[]),
+            patch("memory.save_memory", side_effect=capture_save),
+            patch("memory.merge_memory", side_effect=lambda existing, new: new),
+        ):
+            runner.invoke(memory, ["add", "same text", "--category", "preference"])
+            runner.invoke(memory, ["add", "same text", "--category", "workflow"])
+
+        self.assertEqual(len(ids), 2)
+        self.assertNotEqual(ids[0], ids[1], "Different categories should produce different ids")
+
+    def test_memory_add_preserves_category(self):
+        """Category is still stored in the new schema entry."""
+        from click.testing import CliRunner
+
+        from memory import memory
+
+        runner = CliRunner()
+        saved = []
+
+        with (
+            patch("memory.load_memory", return_value=[]),
+            patch("memory.save_memory", side_effect=lambda e: saved.extend(e)),
+            patch("memory.merge_memory", side_effect=lambda existing, new: new),
+        ):
+            runner.invoke(memory, ["add", "some text", "--category", "workflow"])
+
+        self.assertEqual(saved[0]["category"], "workflow")
+        self.assertEqual(saved[0]["content"], "some text")
+
+
+if __name__ == "__main__":
+    unittest.main()