Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
26 changes: 26 additions & 0 deletions .claude/rules/backend/pydantic-ai.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
# Pydantic AI Rules

## Docs are split between two places

Pydantic AI's documentation lives in two places with **different content**:

1. **`docs/pydantic-ai-llms-full.txt`** — local pinned reference. API signatures,
class docs, model provider APIs, the `pydantic_evals` surface. Refresh via
the `updating-deps` skill (or `curl -sSL https://ai.pydantic.dev/llms-full.txt
-o docs/pydantic-ai-llms-full.txt`).

2. **`https://ai.pydantic.dev/`** — conceptual guides, tutorials, example
applications (chat, RAG, durable execution, graphs, A2A). Not cached locally;
fetch ad-hoc with WebFetch.

**Rule of thumb:** if you're grepping `llms-full.txt` for a worked example or
tutorial and finding nothing, the content hasn't been deleted — it's on the web
docs site. Use WebFetch before giving up.

## Tool error handling

When adding a tool that makes external calls, prefer letting exceptions
propagate so the `tool_execute_error` hook registered in
`backend/src/cortexdj/agents/hooks.py` (`_recover_tool_error`) catches them
and returns a structured recovery payload. Don't wrap the whole tool body in
try/except unless you have a specific reason to handle a known error shape
differently (e.g., token-expired → reconnect prompt).
5 changes: 4 additions & 1 deletion DEVELOPMENT.md
Original file line number Diff line number Diff line change
Expand Up @@ -39,11 +39,14 @@ uv run --directory backend pre-commit run --all-files
## Tests

```bash
uv run --directory backend pytest # run all tests
uv run --directory backend pytest # unit tests (eval suite excluded)
uv run --directory backend pytest -v # verbose output
uv run --directory backend pytest tests/test_preprocessing.py # single file
uv run --directory backend pytest -m eval # real-model brain_agent eval suite (opt-in)
```

The `eval` marker gates tests that call the real OpenAI API via `brain_agent` — the default `pytest` invocation excludes them via `addopts = "-m 'not eval'"`. Use them as a nightly safety net on `main` or manual spot-checks, not on every PR. See `.claude/rules/backend/pydantic-ai.md` and `backend/tests/evals/` for the suite layout.

## Database migrations

```bash
Expand Down
7 changes: 5 additions & 2 deletions backend/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ dependencies = [
"modal>=1.4.1",
"numpy>=2.4.4",
"psycopg[binary,pool]>=3.3.3",
"pydantic-ai>=1.77.0",
"pydantic-ai>=1.79.0",
"pydantic>=2.12.5",
"scipy>=1.17.1",
"spotipy>=2.26.0",
Expand All @@ -31,15 +31,18 @@ dependencies = [
dev = [
"mypy>=1.20.0",
"pre-commit>=4.5.1",
"pydantic-evals>=1.79.0",
"pytest>=9.0.3",
"ruff>=0.15.9",
"ruff>=0.15.10",
]

[tool.pytest.ini_options]
markers = [
"main: marks tests for main demo examples",
"additional: marks tests for additional demo examples",
"eval: marks tests that call a real LLM (opt-in; run with `pytest -m eval`)",
]
addopts = "-m 'not eval'"

[tool.mypy]
strict = true
Expand Down
16 changes: 13 additions & 3 deletions backend/src/cortexdj/agents/brain_agent.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@

import logfire
from pydantic_ai import Agent
from pydantic_ai.capabilities import HistoryProcessor
from pydantic_ai.models.openai import OpenAIResponsesModel
from pydantic_ai.models.openai import OpenAIResponsesModel, OpenAIResponsesModelSettings

from cortexdj.agents.capabilities.classification import ClassificationCapability
from cortexdj.agents.capabilities.insight import InsightCapability
from cortexdj.agents.capabilities.playlist import PlaylistCapability
from cortexdj.agents.capabilities.session import SessionCapability
from cortexdj.agents.deps import AgentDeps
from cortexdj.agents.history_processor import summarize_tool_results
from cortexdj.agents.hooks import build_brain_agent_hooks
from cortexdj.core.config import get_settings

logfire.configure(service_name="cortexdj")
Expand Down Expand Up @@ -75,15 +75,25 @@

_model = OpenAIResponsesModel(model_name=config.AGENT_MODEL)

# Reasoning stays off unless the AGENT_REASONING_EFFORT env var is set. Turn it
# on and validate against backend/tests/evals/test_brain_agent_evals.py before
# committing.
_model_settings: OpenAIResponsesModelSettings | None = None
if config.AGENT_REASONING_EFFORT is not None:
    # Only build a settings object when an effort level was actually configured;
    # otherwise the agent receives ``model_settings=None``.
    _model_settings = OpenAIResponsesModelSettings(
        openai_reasoning_effort=config.AGENT_REASONING_EFFORT
    )

brain_agent = Agent(
model=_model,
model_settings=_model_settings,
deps_type=AgentDeps,
instructions=SYSTEM_PROMPT,
capabilities=[
SessionCapability(),
InsightCapability(),
PlaylistCapability(),
ClassificationCapability(),
HistoryProcessor(summarize_tool_results),
build_brain_agent_hooks(),
],
history_processors=[summarize_tool_results],
)
53 changes: 53 additions & 0 deletions backend/src/cortexdj/agents/hooks.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
"""Hooks for the CortexDJ brain agent.

Tool bodies already handle anticipated failures (Spotify not configured,
token expired, etc.) by returning structured ``{"error": ...}`` dicts.
This module is the safety net for *unanticipated* exceptions — anything
that bubbles out of a tool body would otherwise crash the Vercel AI SDK
stream mid-response. ``on_tool_execute_error`` intercepts those, logs the
traceback, and returns a structured recovery payload so the agent can
explain the failure to the user conversationally.
"""

from __future__ import annotations

import logging
from typing import Any

from pydantic_ai import ToolDefinition
from pydantic_ai.capabilities import Hooks
from pydantic_ai.messages import ToolCallPart
from pydantic_ai.tools import RunContext

from cortexdj.agents.deps import AgentDeps

logger = logging.getLogger(__name__)


def _recovery_payload(tool_name: str, error: Exception) -> dict[str, Any]:
return {
"error": "tool_failed",
"tool": tool_name,
"exception_type": type(error).__name__,
"message": (
f"The {tool_name} tool failed unexpectedly ({type(error).__name__}). "
"Apologize to the user, explain briefly what you were trying to do, "
"and suggest they retry or rephrase."
),
}


async def _recover_tool_error(
    ctx: RunContext[AgentDeps],
    *,
    call: ToolCallPart,
    tool_def: ToolDefinition,
    args: Any,
    error: Exception,
) -> dict[str, Any]:
    """Log an exception that escaped a tool and return a recovery payload.

    Registered as the ``tool_execute_error`` hook by
    ``build_brain_agent_hooks``.

    Args:
        ctx: Run context for the current agent run (unused here).
        call: The tool call part that triggered the execution (unused here).
        tool_def: Definition of the tool that failed; its ``name`` is logged
            and embedded in the payload.
        args: Arguments the tool was invoked with (unused here).
        error: The exception raised by the tool body.

    Returns:
        The dict produced by ``_recovery_payload`` for this tool and error.
    """
    # Pass the exception explicitly instead of relying on the ambient
    # ``sys.exc_info()``: this function receives the error as an argument and
    # has no ``except`` block of its own, so the traceback is only guaranteed
    # to be logged if it is taken from ``error`` itself.
    logger.exception(
        "Unhandled exception in tool %s: %r",
        tool_def.name,
        error,
        exc_info=error,
    )
    return _recovery_payload(tool_def.name, error)


def build_brain_agent_hooks() -> Hooks[AgentDeps]:
    """Create the ``Hooks`` capability used by the brain agent.

    Returns:
        A ``Hooks[AgentDeps]`` instance whose ``tool_execute_error`` handler
        is ``_recover_tool_error``.
    """
    brain_hooks = Hooks[AgentDeps](tool_execute_error=_recover_tool_error)
    return brain_hooks
1 change: 1 addition & 0 deletions backend/src/cortexdj/core/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@ class ApiSettings(BaseSettings):

class AgentSettings(BaseSettings):
    # Settings for the LLM agent.
    # Model name; brain_agent.py passes ``config.AGENT_MODEL`` as ``model_name``
    # to ``OpenAIResponsesModel`` (assumes ``config`` is this settings object —
    # TODO confirm against ``get_settings``).
    AGENT_MODEL: str = "gpt-5.4-mini"
    # Opt-in reasoning effort. ``None`` (the default) means brain_agent.py
    # builds no ``OpenAIResponsesModelSettings`` and passes ``model_settings=None``.
    AGENT_REASONING_EFFORT: Literal["low", "medium", "high"] | None = None


class PostgresSettings(BaseSettings):
Expand Down
8 changes: 8 additions & 0 deletions backend/tests/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
import os

# ``OpenAIResponsesModel`` builds its provider client eagerly when the module
# is imported, and that requires ``OPENAI_API_KEY``. Without a value, any test
# importing ``brain_agent`` — directly or transitively — would fail at
# collection. Tests that actually invoke the model use ``agent.override`` with
# ``TestModel``, so this dummy value never reaches a real API call.
# A key already present in the environment is left untouched.
if "OPENAI_API_KEY" not in os.environ:
    os.environ["OPENAI_API_KEY"] = "sk-test-deterministic-no-real-calls"
Empty file.
43 changes: 43 additions & 0 deletions backend/tests/evals/conftest.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
"""Fixtures for brain_agent evals.

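Provides ``make_fake_deps`` — a constructor for ``AgentDeps`` instances
that don't hit the real database, Spotify, or EEG model — plus
``fake_spotify_client`` and ``fake_eeg_model``, which return ``MagicMock``
stand-ins for the Spotify client and the EEG model. Used by both the
deterministic ``prepare_tools`` tests (TestModel-backed) and the real-model
``@pytest.mark.eval`` tests.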
"""

from __future__ import annotations

from typing import Any
from unittest.mock import MagicMock

import spotipy
from sqlalchemy.ext.asyncio import AsyncSession

from cortexdj.agents.deps import AgentDeps
from cortexdj.ml.predict import EEGModel


def make_fake_deps(
    *,
    spotify_client: spotipy.Spotify | None = None,
    eeg_model: EEGModel | None = None,
    thread_id: str = "test-thread",
    brain_context: Any | None = None,
) -> AgentDeps:
    """Construct an ``AgentDeps`` whose database session is a mock.

    Args:
        spotify_client: Spotify client to attach, or ``None`` for no client.
        eeg_model: EEG model to attach, or ``None`` for no model.
        thread_id: Thread identifier stored on the deps.
        brain_context: Value forwarded unchanged as ``brain_context``.

    Returns:
        An ``AgentDeps`` with ``db`` set to a ``MagicMock`` specced against
        ``AsyncSession`` and every other field taken from the arguments.
    """
    deps_kwargs: dict[str, Any] = {
        # The session is always a mock; callers never supply a database here.
        "db": MagicMock(spec=AsyncSession),
        "eeg_model": eeg_model,
        "spotify_client": spotify_client,
        "thread_id": thread_id,
        "brain_context": brain_context,
    }
    return AgentDeps(**deps_kwargs)


def fake_spotify_client() -> spotipy.Spotify:
    """Return a ``MagicMock`` specced against ``spotipy.Spotify``."""
    spotify_double = MagicMock(spec=spotipy.Spotify)
    return spotify_double


def fake_eeg_model() -> EEGModel:
    """Return a ``MagicMock`` specced against ``EEGModel``."""
    model_double = MagicMock(spec=EEGModel)
    return model_double
Loading