23 changes: 21 additions & 2 deletions src/cache.py
@@ -95,11 +95,30 @@ def merge_memory(existing: List[Dict], new: List[Dict]) -> List[Dict]:

     Supports both old format (entry_id key) and new format (id key based on
     content hash of source_tool:source_file:content).
+
+    The fallback key is a SHA-256 of the entry content — never str(id(e)), which
+    changes with every Python process invocation and would cause duplication (#36).
     """
+    import hashlib
+
+    def _stable_fallback(e: Dict) -> str:
+        """Deterministic key for entries that lack an explicit id field."""
+        # Use content + source fields to build a stable hash so that the
+        # same entry loaded from disk on two different runs gets the same key.
+        raw = "|".join(
+            [
+                str(e.get("content", "")),
+                str(e.get("source_tool", "")),
+                str(e.get("source_file", "")),
+                str(e.get("category", "")),
+            ]
+        )
+        return "fallback:" + hashlib.sha256(raw.encode()).hexdigest()[:16]

     def _key(e: Dict) -> str:
-        # New format uses "id" (content-hash), old format uses "entry_id"
-        return e.get("id") or e.get("entry_id") or str(id(e))
+        # New format uses "id" (content-hash), old format uses "entry_id".
+        # Fallback to a stable content hash — never str(id(e)) (#36).
+        return e.get("id") or e.get("entry_id") or _stable_fallback(e)

     index = {_key(e): e for e in existing}
     for e in new:
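For reviewers, a minimal sketch of the intended dedup behaviour (not part of the diff; it assumes merge_memory is importable from cache, the same layout the new tests use):

# Sketch only: the same logical entry as two separate dict objects, as happens
# when the cache is reloaded from disk on a later `apc collect` run.
from cache import merge_memory

entry = {"content": "prefers tabs", "source_tool": "openclaw", "source_file": "MEMORY.md"}
merged = merge_memory([dict(entry)], [dict(entry)])
# With the old str(id(e)) fallback the two objects got different keys and duplicated;
# with the stable content hash they collapse to a single entry.
assert len(merged) == 1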
47 changes: 36 additions & 11 deletions src/memory.py
@@ -21,6 +21,11 @@ def _is_raw_file_entry(entry: dict) -> bool:
     return "source_file" in entry


+def _is_manual_entry(entry: dict) -> bool:
+    """Detect manually-added entries (source_tool == 'manual')."""
+    return entry.get("source_tool") == "manual"
+
+
 @click.group()
 def memory():
     """Manage AI memory entries (local cache)."""
@@ -39,15 +44,20 @@ def memory():
 )
 def add(text, category):
     """Add a memory entry to local cache. Usage: apc memory add "your text" """
-    ts = datetime.now(timezone.utc).strftime("%Y%m%d_%H%M%S")
-    short_hash = hashlib.md5(text.encode()).hexdigest()[:6]
-    entry_id = f"{ts}_{short_hash}"
+    # Use the new schema (id + source_tool) so that:
+    # 1. The same text added twice is idempotent — content-hash id is stable (#45)
+    # 2. merge_memory deduplicates via 'id', not a timestamp-based entry_id
+    content_id = hashlib.sha256(f"manual:{category}:{text}".encode()).hexdigest()[:16]
+    now = datetime.now(timezone.utc).isoformat()

     new_entry = {
-        "entry_id": entry_id,
+        "id": content_id,
+        "source_tool": "manual",
+        "source_file": "memory_add",
+        "label": f"Manual [{category}]",
         "category": category,
         "content": text,
-        "source": "manual_add",
+        "collected_at": now,
     }

     existing = load_memory()
@@ -65,11 +75,26 @@ def list_entries():
         click.echo("No memory entries found. Run 'apc collect' or 'apc memory add \"...\"' first.")
         return

-    # Separate raw-file entries from legacy entries
-    raw_files = [e for e in entries if _is_raw_file_entry(e)]
+    # Separate entries into three groups:
+    # 1. manually-added (new schema, source_tool=="manual") — show content
+    # 2. raw collected files (new schema, source_tool != "manual") — show path + size
+    # 3. legacy format (no source_file key) — show content grouped by category
+    manual = [e for e in entries if _is_raw_file_entry(e) and _is_manual_entry(e)]
+    raw_files = [e for e in entries if _is_raw_file_entry(e) and not _is_manual_entry(e)]
     legacy = [e for e in entries if not _is_raw_file_entry(e)]

-    # Display raw-file entries
+    # Display manually-added entries grouped by category
+    if manual:
+        by_category: dict = {}
+        for entry in manual:
+            cat = entry.get("category", "manual")
+            by_category.setdefault(cat, []).append(entry)
+        for cat, items in sorted(by_category.items()):
+            click.echo(f"\n[{cat}]")
+            for item in items:
+                click.echo(f" - {item.get('content', '')} (manual)")
+
+    # Display collected file entries (tool memory files)
     if raw_files:
         click.echo("\n[Collected Files]")
         for entry in raw_files:
@@ -84,12 +109,12 @@ def list_entries():

     # Display legacy entries grouped by category
     if legacy:
-        by_category = {}
+        by_category2: dict = {}
         for entry in legacy:
             cat = entry.get("category", "unknown")
-            by_category.setdefault(cat, []).append(entry)
+            by_category2.setdefault(cat, []).append(entry)

-        for cat, items in sorted(by_category.items()):
+        for cat, items in sorted(by_category2.items()):
             click.echo(f"\n[{cat}]")
             for item in items:
                 source = item.get("source", "")
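A short sketch of why `apc memory add` is now idempotent. manual_id below is a hypothetical helper that mirrors the content_id expression in add() above; it is not part of the diff:

import hashlib

def manual_id(category: str, text: str) -> str:
    # Mirrors add(): the id is a hash of category + text, so the same input
    # always yields the same id and merge_memory upserts instead of appending.
    return hashlib.sha256(f"manual:{category}:{text}".encode()).hexdigest()[:16]

assert manual_id("preference", "prefers tabs") == manual_id("preference", "prefers tabs")
assert manual_id("preference", "same text") != manual_id("workflow", "same text")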
156 changes: 156 additions & 0 deletions tests/test_memory_bugs_36_45.py
@@ -0,0 +1,156 @@
"""Tests for memory schema and deduplication fixes (#36, #45)."""

import unittest
from unittest.mock import patch


class TestMergeMemoryDedup(unittest.TestCase):
"""#36 — Memory entries duplicate on every apc collect."""

def test_same_content_no_id_not_duplicated(self):
"""Entries lacking id/entry_id are deduplicated by stable content hash."""
from cache import merge_memory

entry = {"content": "hello", "source_tool": "test", "category": "pref"}
# Same logical entry as two separate dict instances (simulates load from disk)
entry1 = dict(entry)
entry2 = dict(entry)

result = merge_memory([entry1], [entry2])
self.assertEqual(len(result), 1, "Same content must not create a duplicate")

def test_different_content_creates_two_entries(self):
"""Different content creates two separate entries."""
from cache import merge_memory

e1 = {"content": "hello", "source_tool": "test", "category": "pref"}
e2 = {"content": "world", "source_tool": "test", "category": "pref"}
result = merge_memory([e1], [e2])
self.assertEqual(len(result), 2)

def test_stable_id_deduplicates_correctly(self):
"""Entries with an explicit 'id' are deduplicated by that id."""
from cache import merge_memory

e1 = {"id": "abc123", "content": "old", "source_tool": "t"}
e2 = {"id": "abc123", "content": "new", "source_tool": "t"}
result = merge_memory([e1], [e2])
self.assertEqual(len(result), 1)
self.assertEqual(result[0]["content"], "new", "New entry should win (upsert)")

def test_entry_id_key_stable_across_runs(self):
"""entry_id (old format) provides a stable dedup key across reloads."""
from cache import merge_memory

e = {"entry_id": "20260307_abc123", "content": "hello", "category": "pref"}
# Same entry reloaded = new Python object, but same entry_id
e_reloaded = dict(e)
result = merge_memory([e], [e_reloaded])
self.assertEqual(len(result), 1)

def test_no_id_same_source_tool_file_deduplicates(self):
"""Fallback hash uses source_tool + source_file + content for stable keying."""
from cache import merge_memory

base = {"content": "data", "source_tool": "openclaw", "source_file": "MEMORY.md"}
result = merge_memory([dict(base)], [dict(base)])
self.assertEqual(len(result), 1, "Same source+content should deduplicate")


class TestMemoryAddNewSchema(unittest.TestCase):
"""#45 — apc memory add uses legacy schema vs collect's new schema."""

def test_memory_add_uses_id_not_entry_id(self):
"""memory add now creates entries with 'id' field (new schema)."""
from click.testing import CliRunner

from memory import memory

runner = CliRunner()
saved = []

with (
patch("memory.load_memory", return_value=[]),
patch("memory.save_memory", side_effect=lambda entries: saved.extend(entries)),
patch("memory.merge_memory", side_effect=lambda existing, new: new),
):
result = runner.invoke(memory, ["add", "test memory text"])

self.assertEqual(result.exit_code, 0)
self.assertEqual(len(saved), 1)
entry = saved[0]
# New schema: must have 'id', not 'entry_id'
self.assertIn("id", entry, "New schema requires 'id' field")
self.assertNotIn("entry_id", entry, "'entry_id' is legacy — must not appear")
self.assertIn("source_tool", entry)
self.assertEqual(entry["source_tool"], "manual")
self.assertIn("source_file", entry)

def test_memory_add_same_text_same_id(self):
"""Adding the same text twice produces the same id (idempotent)."""
from click.testing import CliRunner

from memory import memory

runner = CliRunner()
ids = []

def capture_save(entries):
ids.extend(e["id"] for e in entries)

with (
patch("memory.load_memory", return_value=[]),
patch("memory.save_memory", side_effect=capture_save),
patch("memory.merge_memory", side_effect=lambda existing, new: new),
):
runner.invoke(memory, ["add", "repeated text"])
runner.invoke(memory, ["add", "repeated text"])

self.assertEqual(len(ids), 2)
self.assertEqual(ids[0], ids[1], "Same text must produce the same id")

def test_memory_add_different_category_different_id(self):
"""Same text with different category produces a different id."""
from click.testing import CliRunner

from memory import memory

runner = CliRunner()
ids = []

def capture_save(entries):
ids.extend(e["id"] for e in entries)

with (
patch("memory.load_memory", return_value=[]),
patch("memory.save_memory", side_effect=capture_save),
patch("memory.merge_memory", side_effect=lambda existing, new: new),
):
runner.invoke(memory, ["add", "same text", "--category", "preference"])
runner.invoke(memory, ["add", "same text", "--category", "workflow"])

self.assertEqual(len(ids), 2)
self.assertNotEqual(ids[0], ids[1], "Different categories should produce different ids")

def test_memory_add_preserves_category(self):
"""Category is still stored in the new schema entry."""
from click.testing import CliRunner

from memory import memory

runner = CliRunner()
saved = []

with (
patch("memory.load_memory", return_value=[]),
patch("memory.save_memory", side_effect=lambda e: saved.extend(e)),
patch("memory.merge_memory", side_effect=lambda existing, new: new),
):
runner.invoke(memory, ["add", "some text", "--category", "workflow"])

self.assertEqual(saved[0]["category"], "workflow")
self.assertEqual(saved[0]["content"], "some text")


if __name__ == "__main__":
unittest.main()
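The tests above patch merge_memory out, so as a final sanity check here is a sketch of the two fixes composing end to end. It assumes the add command follows the load_memory -> merge_memory -> save_memory flow implied by the mocks and that --category has a default; neither is shown in this diff:

from unittest.mock import patch
from click.testing import CliRunner

from cache import merge_memory
from memory import memory

runner = CliRunner()
store = []  # stands in for the on-disk cache file

def fake_save(entries):
    store.clear()
    store.extend(entries)

with (
    patch("memory.load_memory", side_effect=lambda: list(store)),
    patch("memory.save_memory", side_effect=fake_save),
    patch("memory.merge_memory", side_effect=merge_memory),  # real merge, fake storage
):
    runner.invoke(memory, ["add", "prefers tabs"])
    runner.invoke(memory, ["add", "prefers tabs"])  # same text -> same id -> upsert

assert len(store) == 1  # one entry, not two (#36 and #45 together)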