sdk/python: Anthropic prompt caching

ndeodhar · ndeodhar · commit c1e7416eee2a · 2026-02-15T17:56:16.000-08:00
diff --git a/sdk/python/polos/agents/agent.py b/sdk/python/polos/agents/agent.py
@@ -386,6 +386,7 @@ def __init__(
         guardrails: Callable | str | list[Callable | str] | None = None,
         guardrail_max_retries: int = 2,
         conversation_history: int = 10,  # Number of messages to keep
+        stream_to_workflow: bool = False,
     ):
         # Parse queue configuration (same as task decorator)
         queue_name: str | None = None
@@ -442,6 +443,9 @@ def __init__(
         # Conversation history
         self.conversation_history = conversation_history
 
+        # Stream to workflow topic for all invocations
+        self.stream_to_workflow = stream_to_workflow
+
         # Convert Pydantic model to JSON schema if provided
         self._output_json_schema, self._output_schema_name = convert_output_schema(
             output_schema, context_id=self.id
@@ -505,7 +509,7 @@ async def _agent_execute(self, ctx: AgentContext, payload: dict[str, Any]) -> di
             )
 
         input_data = payload.get("input")
-        streaming = payload.get("streaming", False)  # Whether to stream or return final result
+        streaming = payload.get("streaming", False) or self.stream_to_workflow
         provider_kwargs = payload.get(
             "provider_kwargs", {}
         )  # Additional kwargs to pass to provider
diff --git a/sdk/python/polos/agents/stream.py b/sdk/python/polos/agents/stream.py
@@ -118,6 +118,8 @@ async def _agent_stream_function(ctx: AgentContext, payload: dict[str, Any]) ->
     final_input_tokens = 0
     final_output_tokens = 0
     final_total_tokens = 0
+    final_cache_read_input_tokens = 0
+    final_cache_creation_input_tokens = 0
     last_llm_result_content = None
     all_tool_results = []
     steps: list[Step] = []
@@ -242,6 +244,10 @@ async def _agent_stream_function(ctx: AgentContext, payload: dict[str, Any]) ->
             final_input_tokens += usage_dict.get("input_tokens", 0)
             final_output_tokens += usage_dict.get("output_tokens", 0)
             final_total_tokens += usage_dict.get("total_tokens", 0)
+            if usage_dict.get("cache_read_input_tokens"):
+                final_cache_read_input_tokens += usage_dict["cache_read_input_tokens"]
+            if usage_dict.get("cache_creation_input_tokens"):
+                final_cache_creation_input_tokens += usage_dict["cache_creation_input_tokens"]
 
         last_llm_result_content = llm_result.get("content")
         tool_calls = llm_result.get("tool_calls") or []
@@ -555,6 +561,16 @@ async def _agent_stream_function(ctx: AgentContext, payload: dict[str, Any]) ->
                 "input_tokens": final_input_tokens,
                 "output_tokens": final_output_tokens,
                 "total_tokens": final_total_tokens,
+                **(
+                    {"cache_read_input_tokens": final_cache_read_input_tokens}
+                    if final_cache_read_input_tokens > 0
+                    else {}
+                ),
+                **(
+                    {"cache_creation_input_tokens": final_cache_creation_input_tokens}
+                    if final_cache_creation_input_tokens > 0
+                    else {}
+                ),
             },
         }
     )
diff --git a/sdk/python/polos/execution/tools/exec.py b/sdk/python/polos/execution/tools/exec.py
@@ -41,8 +41,7 @@ async def _request_approval(
             "_form": {
                 "title": "Approve command execution",
                 "description": (
-                    f"The agent wants to run a shell command in the "
-                    f"{env_info.type} environment."
+                    f"The agent wants to run a shell command in the {env_info.type} environment."
                 ),
                 "fields": [
                     {
diff --git a/sdk/python/polos/llm/providers/anthropic.py b/sdk/python/polos/llm/providers/anthropic.py
@@ -1,11 +1,68 @@
 """Anthropic provider implementation."""
 
 import json
+import logging
 import os
 from typing import Any
 
 from .base import LLMProvider, LLMResponse, register_provider
 
+logger = logging.getLogger(__name__)
+
+ANTHROPIC_CACHE_CONTROL = {"type": "ephemeral"}
+
+
+def _apply_cache_control(request_params: dict[str, Any]) -> None:
+    """Add Anthropic prompt caching breakpoints to ``request_params``.
+
+    Marks the last system block, the last tool and the last content block of the last
+    message with ``cache_control``. Only keys of ``request_params`` are rebound; the
+    caller's lists and dicts are copied rather than mutated, so a message history reused
+    across calls cannot pile up breakpoints (Anthropic allows at most four per request).
+    Empty text blocks and non-dict blocks cannot take the marker and are left untouched.
+    """
+
+    def _mark_last(blocks: Any) -> Any:
+        # Return a copy of ``blocks`` whose last dict block carries the cache marker.
+        if not isinstance(blocks, list) or not blocks or not isinstance(blocks[-1], dict):
+            return blocks
+        if blocks[-1].get("type") == "text" and not blocks[-1].get("text"):
+            return blocks  # Anthropic rejects cache_control on empty text blocks
+        return [*blocks[:-1], {**blocks[-1], "cache_control": ANTHROPIC_CACHE_CONTROL}]
+
+    def _as_blocks(value: Any) -> Any:
+        # Wrap a non-empty string in a single text block so that it can be marked.
+        return [{"type": "text", "text": value}] if isinstance(value, str) and value else value
+
+    # 1. System prompt: a string or a list of content blocks
+    if "system" in request_params:
+        request_params["system"] = _mark_last(_as_blocks(request_params["system"]))
+
+    # 2. Tools: mark the last tool definition
+    if "tools" in request_params:
+        request_params["tools"] = _mark_last(request_params["tools"])
+
+    # 3. Messages: mark the last content block of the last message
+    messages = request_params.get("messages")
+    if isinstance(messages, list) and messages and isinstance(messages[-1], dict):
+        content = messages[-1].get("content")
+        marked = _mark_last(_as_blocks(content))
+        if marked is not content:
+            request_params["messages"] = [*messages[:-1], {**messages[-1], "content": marked}]
+
+
+def _extract_cache_usage(usage_data: Any) -> dict[str, int]:
+    """Collect the cache token counters that Anthropic reported on ``usage_data``.
+
+    Reads ``cache_read_input_tokens`` and ``cache_creation_input_tokens`` as attributes;
+    a counter that is absent or ``None`` is omitted. Falsy ``usage_data`` yields ``{}``.
+    """
+    if not usage_data:
+        return {}
+    names = ("cache_read_input_tokens", "cache_creation_input_tokens")
+    found = {name: getattr(usage_data, name, None) for name in names}
+    return {name: count for name, count in found.items() if count is not None}
+
 
 @register_provider("anthropic")
 class AnthropicProvider(LLMProvider):
@@ -149,6 +206,10 @@ async def generate(
 
         # Add any additional kwargs
         request_params.update(kwargs)
+
+        # Apply prompt caching breakpoints
+        _apply_cache_control(request_params)
+
         try:
             # Use the SDK's Messages API
             response = await self.client.messages.create(**request_params)
@@ -202,6 +263,7 @@ async def generate(
                 "total_tokens": (usage_data.input_tokens + usage_data.output_tokens)
                 if usage_data
                 else 0,
+                **_extract_cache_usage(usage_data),
             }
 
             # Extract model and stop_reason from response
@@ -344,6 +406,10 @@ async def stream(
 
         # Add any additional kwargs
         request_params.update(kwargs)
+
+        # Apply prompt caching breakpoints
+        _apply_cache_control(request_params)
+
         try:
             # Use the SDK's Messages API with streaming
             stream = await self.client.messages.create(**request_params)
@@ -512,6 +578,14 @@ async def stream(
                                     usage["input_tokens"] = usage_data.get("input_tokens")
                                 if usage_data.get("output_tokens"):
                                     usage["output_tokens"] = usage_data.get("output_tokens")
+                                if usage_data.get("cache_read_input_tokens") is not None:
+                                    usage["cache_read_input_tokens"] = usage_data.get(
+                                        "cache_read_input_tokens"
+                                    )
+                                if usage_data.get("cache_creation_input_tokens") is not None:
+                                    usage["cache_creation_input_tokens"] = usage_data.get(
+                                        "cache_creation_input_tokens"
+                                    )
 
                 elif event_type == "message_stop":
                     # Stream complete - final event
diff --git a/sdk/python/polos/types/types.py b/sdk/python/polos/types/types.py
@@ -11,6 +11,8 @@ class Usage(BaseModel):
     input_tokens: int = 0
     output_tokens: int = 0
     total_tokens: int = 0
+    cache_read_input_tokens: int | None = None
+    cache_creation_input_tokens: int | None = None
 
 
 class ToolCallFunction(BaseModel):
diff --git a/sdk/python/tests/unit/test_agents/test_agent.py b/sdk/python/tests/unit/test_agents/test_agent.py
@@ -613,3 +613,116 @@ def guardrail2(ctx, guardrail_ctx):
         # Test with invalid type
         with pytest.raises(TypeError, match="Invalid guardrails type"):
             agent._normalize_guardrails(123)  # type: ignore
+
+    def test_agent_stream_to_workflow_default_false(self):
+        """An Agent built without ``stream_to_workflow`` leaves the flag set to ``False``."""
+        default_agent = Agent(id="test-agent", model="gpt-4", provider="openai")
+        assert default_agent.stream_to_workflow is False
+
+    def test_agent_stream_to_workflow_true(self):
+        """Passing ``stream_to_workflow=True`` to the constructor is stored on the Agent."""
+        agent_kwargs = {
+            "id": "test-agent",
+            "model": "gpt-4",
+            "provider": "openai",
+            "stream_to_workflow": True,
+        }
+        assert Agent(**agent_kwargs).stream_to_workflow is True
+
+
+class TestAgentStreamToWorkflow:
+    """Tests for stream_to_workflow streaming flag resolution in _agent_execute.
+
+    Every case runs ``Agent._agent_execute`` against a mocked context while
+    ``polos.agents.stream._agent_stream_function`` is patched with an ``AsyncMock``,
+    then checks the ``streaming`` entry of the second positional argument that the
+    patched function received.
+    """
+
+    @staticmethod
+    def _make_context():
+        """Return a ``MagicMock`` context with fixed ids and an async ``step.uuid``."""
+        ctx = MagicMock()
+        ctx.execution_id = "exec-123"
+        ctx.session_id = "sess-123"
+        ctx.user_id = "user-123"
+        ctx.step.uuid = AsyncMock(return_value="conv-123")
+        return ctx
+
+    async def _forwarded_streaming(self, agent, payload_streaming):
+        """Execute ``agent`` once and report the ``streaming`` flag it forwarded.
+
+        Args:
+            agent: The Agent whose ``_agent_execute`` is invoked.
+            payload_streaming: Value placed under ``"streaming"`` in the payload.
+
+        Returns:
+            The ``"streaming"`` entry of the second positional argument passed to the
+            patched ``_agent_stream_function``.
+        """
+        ctx = self._make_context()
+        payload = {"input": "hello", "streaming": payload_streaming}
+
+        with patch(
+            "polos.agents.stream._agent_stream_function", new_callable=AsyncMock
+        ) as stream_fn:
+            stream_fn.return_value = {"result": "ok"}
+
+            await agent._agent_execute(ctx, payload)
+
+            positional_args = stream_fn.call_args[0]
+            return positional_args[1]["streaming"]
+
+    @pytest.mark.asyncio
+    async def test_stream_to_workflow_false_payload_streaming_false(self):
+        """Neither the agent flag nor the payload requests streaming.
+
+        A default agent (``stream_to_workflow`` unset) given ``streaming=False`` must
+        forward ``streaming=False`` to the stream function.
+        """
+        default_agent = Agent(id="test-agent-stw-1", model="gpt-4", provider="openai")
+
+        forwarded = await self._forwarded_streaming(default_agent, payload_streaming=False)
+
+        assert forwarded is False
+
+    @pytest.mark.asyncio
+    async def test_stream_to_workflow_true_payload_streaming_false(self):
+        """With ``stream_to_workflow=True``, payload ``streaming=False`` is forwarded as True."""
+        flagged_agent = Agent(
+            id="test-agent-stw-2",
+            model="gpt-4",
+            provider="openai",
+            stream_to_workflow=True,
+        )
+
+        forwarded = await self._forwarded_streaming(flagged_agent, payload_streaming=False)
+
+        assert forwarded is True
+
+    @pytest.mark.asyncio
+    async def test_stream_to_workflow_true_payload_streaming_true(self):
+        """With ``stream_to_workflow=True`` and payload ``streaming=True``, True is forwarded."""
+        flagged_agent = Agent(
+            id="test-agent-stw-3",
+            model="gpt-4",
+            provider="openai",
+            stream_to_workflow=True,
+        )
+
+        forwarded = await self._forwarded_streaming(flagged_agent, payload_streaming=True)
+
+        assert forwarded is True
+
+    @pytest.mark.asyncio
+    async def test_stream_to_workflow_false_payload_streaming_true(self):
+        """Only the payload requests streaming.
+
+        A default agent given ``streaming=True`` in the payload must forward
+        ``streaming=True`` to the stream function (behaviour without the agent flag).
+        """
+        default_agent = Agent(id="test-agent-stw-4", model="gpt-4", provider="openai")
+
+        forwarded = await self._forwarded_streaming(default_agent, payload_streaming=True)
+
+        assert forwarded is True
diff --git a/sdk/python/tests/unit/test_agents/test_stop_conditions.py b/sdk/python/tests/unit/test_agents/test_stop_conditions.py
@@ -223,16 +223,12 @@ def test_max_steps_continues_when_count_not_reached(self):
     def test_max_steps_default_count(self):
         """Test max_steps uses default count=20."""
         config = MaxStepsConfig()  # Uses default count=20
-        ctx = StopConditionContext(
-            steps=[Step(step=i) for i in range(1, 6)]
-        )
+        ctx = StopConditionContext(steps=[Step(step=i) for i in range(1, 6)])
         configured = max_steps(config)
         result = configured(ctx)
         assert result is False  # 5 < 20
 
-        ctx_at_limit = StopConditionContext(
-            steps=[Step(step=i) for i in range(1, 21)]
-        )
+        ctx_at_limit = StopConditionContext(steps=[Step(step=i) for i in range(1, 21)])
         result = configured(ctx_at_limit)
         assert result is True  # 20 >= 20