LukeMainwaring · LukeMainwaring · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026 · Apr 10, 2026
@@ -0,0 +1,26 @@
+# Pydantic AI Rules
+
+## Docs are split between two places
+
+Pydantic AI's documentation lives in two places with **different content**:
+
+1. **`docs/pydantic-ai-llms-full.txt`** — local pinned reference. API signatures,
+   class docs, model provider APIs, the `pydantic_evals` surface. Refresh via
+   the `updating-deps` skill (or `curl -sSL https://ai.pydantic.dev/llms-full.txt
+   -o docs/pydantic-ai-llms-full.txt`).
+
+2. **`https://ai.pydantic.dev/`** — conceptual guides, tutorials, example
+   applications (chat, RAG, durable execution, graphs, A2A). Not cached locally;
+   fetch ad-hoc with WebFetch.
+
+**Rule of thumb:** if you're grepping `llms-full.txt` for a worked example or
+tutorial and finding nothing, the content hasn't been deleted — it's on the web
+docs site. Use WebFetch before giving up.
+
+## Tool error handling
+
+When adding a tool that makes external calls, prefer letting unexpected
+exceptions propagate so the `tool_execute_error` hook registered in
+`backend/src/cortexdj/agents/hooks.py` catches them and returns a structured
+recovery payload. Don't wrap the whole tool body in try/except unless you need
+to handle a known error shape differently (e.g., token-expired → reconnect prompt).
@@ -39,11 +39,14 @@ uv run --directory backend pre-commit run --all-files
 ## Tests
 
 ```bash
-uv run --directory backend pytest              # run all tests
+uv run --directory backend pytest              # unit tests (eval suite excluded)
 uv run --directory backend pytest -v           # verbose output
 uv run --directory backend pytest tests/test_preprocessing.py  # single file
+uv run --directory backend pytest -m eval      # real-model brain_agent eval suite (opt-in)
 ```
 
+The `eval` marker gates tests that call the real OpenAI API via `brain_agent`; the default `pytest` invocation excludes them via `addopts = "-m 'not eval'"`. Run them as a nightly safety net on `main` or as manual spot-checks, not on every PR. See `.claude/rules/backend/pydantic-ai.md` and `backend/tests/evals/` for the suite layout.
+
 ## Database migrations
 
 ```bash

@@ -18,7 +18,7 @@ dependencies = [
   "modal>=1.4.1",
   "numpy>=2.4.4",
   "psycopg[binary,pool]>=3.3.3",
-  "pydantic-ai>=1.77.0",
+  "pydantic-ai>=1.79.0",
   "pydantic>=2.12.5",
   "scipy>=1.17.1",
   "spotipy>=2.26.0",
@@ -31,15 +31,18 @@ dependencies = [
 dev = [
   "mypy>=1.20.0",
   "pre-commit>=4.5.1",
+  "pydantic-evals>=1.79.0",
   "pytest>=9.0.3",
-  "ruff>=0.15.9",
+  "ruff>=0.15.10",
 ]
 
 [tool.pytest.ini_options]
 markers = [
     "main: marks tests for main demo examples",
     "additional: marks tests for additional demo examples",
+    "eval: marks tests that call a real LLM (opt-in; run with `pytest -m eval`)",
 ]
+addopts = "-m 'not eval'"
 
 [tool.mypy]
 strict = true

@@ -8,15 +8,15 @@
 
 import logfire
 from pydantic_ai import Agent
-from pydantic_ai.capabilities import HistoryProcessor
-from pydantic_ai.models.openai import OpenAIResponsesModel
+from pydantic_ai.models.openai import OpenAIResponsesModel, OpenAIResponsesModelSettings
 
 from cortexdj.agents.capabilities.classification import ClassificationCapability
 from cortexdj.agents.capabilities.insight import InsightCapability
 from cortexdj.agents.capabilities.playlist import PlaylistCapability
 from cortexdj.agents.capabilities.session import SessionCapability
 from cortexdj.agents.deps import AgentDeps
 from cortexdj.agents.history_processor import summarize_tool_results
+from cortexdj.agents.hooks import build_brain_agent_hooks
 from cortexdj.core.config import get_settings
 
 logfire.configure(service_name="cortexdj")
@@ -75,15 +75,25 @@
 
 _model = OpenAIResponsesModel(model_name=config.AGENT_MODEL)
 
+# Reasoning is opt-in via AGENT_REASONING_EFFORT env var. Enable and validate
+# against backend/tests/evals/test_brain_agent_evals.py before committing.
+_model_settings = (
+    OpenAIResponsesModelSettings(openai_reasoning_effort=config.AGENT_REASONING_EFFORT)
+    if config.AGENT_REASONING_EFFORT is not None
+    else None
+)
+
 brain_agent = Agent(
     model=_model,
+    model_settings=_model_settings,
     deps_type=AgentDeps,
     instructions=SYSTEM_PROMPT,
     capabilities=[
         SessionCapability(),
         InsightCapability(),
         PlaylistCapability(),
         ClassificationCapability(),
-        HistoryProcessor(summarize_tool_results),
+        build_brain_agent_hooks(),
     ],
+    history_processors=[summarize_tool_results],
 )
@@ -0,0 +1,53 @@
+"""Hooks for the CortexDJ brain agent.
+
+Tool bodies already handle anticipated failures (Spotify not configured,
+token expired, etc.) by returning structured ``{"error": ...}`` dicts.
+This module is the safety net for *unanticipated* exceptions — anything
+that bubbles out of a tool body would otherwise crash the Vercel AI SDK
+stream mid-response. ``on_tool_execute_error`` intercepts those, logs the
+traceback, and returns a structured recovery payload so the agent can
+explain the failure to the user conversationally.
+"""
+
+from __future__ import annotations
+
+import logging
+from typing import Any
+
+from pydantic_ai import ToolDefinition
+from pydantic_ai.capabilities import Hooks
+from pydantic_ai.messages import ToolCallPart
+from pydantic_ai.tools import RunContext
+
+from cortexdj.agents.deps import AgentDeps
+
+logger = logging.getLogger(__name__)
+
+
+def _recovery_payload(tool_name: str, error: Exception) -> dict[str, Any]:
+    return {
+        "error": "tool_failed",
+        "tool": tool_name,
+        "exception_type": type(error).__name__,
+        "message": (
+            f"The {tool_name} tool failed unexpectedly ({type(error).__name__}). "
+            "Apologize to the user, explain briefly what you were trying to do, "
+            "and suggest they retry or rephrase."
+        ),
+    }
+
+
+async def _recover_tool_error(
+    ctx: RunContext[AgentDeps],
+    *,
+    call: ToolCallPart,
+    tool_def: ToolDefinition,
+    args: Any,
+    error: Exception,
+) -> dict[str, Any]:
+    logger.exception(f"Unhandled exception in tool {tool_def.name}: {error!r}")
+    return _recovery_payload(tool_def.name, error)
+
+
+def build_brain_agent_hooks() -> Hooks[AgentDeps]:
+    return Hooks[AgentDeps](tool_execute_error=_recover_tool_error)
@@ -16,6 +16,7 @@ class ApiSettings(BaseSettings):
 
 class AgentSettings(BaseSettings):
     AGENT_MODEL: str = "gpt-5.4-mini"
+    AGENT_REASONING_EFFORT: Literal["low", "medium", "high"] | None = None
 
 
 class PostgresSettings(BaseSettings):

@@ -0,0 +1,8 @@
+import os
+
+# ``OpenAIResponsesModel`` is constructed when the ``brain_agent`` module is
+# imported, which requires ``OPENAI_API_KEY``. Any test that imports
+# ``brain_agent`` — directly or transitively — would otherwise fail at
+# collection. ``setdefault`` keeps a real key if one is already exported (as
+# ``-m eval`` runs need); other tests override the model with ``TestModel``.
+os.environ.setdefault("OPENAI_API_KEY", "sk-test-deterministic-no-real-calls")
@@ -0,0 +1,43 @@
+"""Fixtures for brain_agent evals.
+
+Provides ``make_fake_deps`` — a constructor for ``AgentDeps`` instances
+that don't hit the real database, Spotify, or EEG model. Used by both
+the deterministic ``prepare_tools`` tests (TestModel-backed) and the
+real-model ``@pytest.mark.eval`` tests.
+"""
+
+from __future__ import annotations
+
+from typing import Any
+from unittest.mock import MagicMock
+
+import spotipy
+from sqlalchemy.ext.asyncio import AsyncSession
+
+from cortexdj.agents.deps import AgentDeps
+from cortexdj.ml.predict import EEGModel
+
+
+def make_fake_deps(
+    *,
+    spotify_client: spotipy.Spotify | None = None,
+    eeg_model: EEGModel | None = None,
+    thread_id: str = "test-thread",
+    brain_context: Any | None = None,
+) -> AgentDeps:
+    fake_db = MagicMock(spec=AsyncSession)
+    return AgentDeps(
+        db=fake_db,
+        eeg_model=eeg_model,
+        spotify_client=spotify_client,
+        thread_id=thread_id,
+        brain_context=brain_context,
+    )
+
+
+def fake_spotify_client() -> spotipy.Spotify:
+    return MagicMock(spec=spotipy.Spotify)  # spec limits attributes to those spotipy.Spotify defines
+
+
+def fake_eeg_model() -> EEGModel:
+    return MagicMock(spec=EEGModel)  # spec limits attributes to those EEGModel defines