CraftOS-dev · ahmad-ajmal · Apr 22, 2026 · Apr 13, 2026 · Apr 13, 2026 · Apr 13, 2026
diff --git a/agent_core/core/embedding_interface.py b/agent_core/core/embedding_interface.py
@@ -12,6 +12,8 @@
 - GOOGLE_API_KEY (for provider="gemini")
 """
 
+from __future__ import annotations
+
 import os
 from typing import List, Optional
 

diff --git a/agent_core/core/impl/action/router.py b/agent_core/core/impl/action/router.py
@@ -6,6 +6,8 @@
 based on user queries using LLM reasoning.
 """
 
+from __future__ import annotations
+
 import json
 import ast
 from typing import Optional, List, Dict, Any, Tuple

diff --git a/agent_core/core/impl/llm/cache/config.py b/agent_core/core/impl/llm/cache/config.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+
 import os
 from dataclasses import dataclass
 from typing import Optional

diff --git a/agent_core/core/impl/llm/cache/metrics.py b/agent_core/core/impl/llm/cache/metrics.py
@@ -7,6 +7,7 @@
 
 from __future__ import annotations
 
+
 import logging
 from dataclasses import dataclass
 from typing import Dict, Optional

diff --git a/agent_core/core/impl/llm/errors.py b/agent_core/core/impl/llm/errors.py
@@ -8,6 +8,7 @@
 
 from __future__ import annotations
 
+
 from typing import Optional
 
 # Import provider exception types

diff --git a/agent_core/core/impl/llm/interface.py b/agent_core/core/impl/llm/interface.py
@@ -1184,9 +1184,10 @@ def _generate_openai(
             # Always enforce JSON output format
             request_kwargs["response_format"] = {"type": "json_object"}
 
-            # Add prompt_cache_key when call_type is provided for better cache routing
-            # This helps when alternating between different call types (reasoning, action_selection)
-            if call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens:
+            # Add prompt_cache_key for cache routing on every provider except Grok
+            # (e.g. OpenAI, DeepSeek). Grok (xAI) relies on automatic prefix caching
+            # and ignores this parameter, so it is not sent for Grok.
+            if self.provider != "grok" and call_type and system_prompt and len(system_prompt) >= config.min_cache_tokens:
                 prompt_hash = hashlib.sha256(system_prompt.encode()).hexdigest()[:16]
                 cache_key = f"{call_type}_{prompt_hash}"
                 request_kwargs["extra_body"] = {"prompt_cache_key": cache_key}
@@ -1197,21 +1198,26 @@ def _generate_openai(
             token_count_input = response.usage.prompt_tokens
             token_count_output = response.usage.completion_tokens
 
-            # Extract cached tokens from prompt_tokens_details (OpenAI automatic caching)
-            # Available for prompts ≥1024 tokens
-            prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None)
-            if prompt_tokens_details:
-                cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0
+            # Extract cached tokens — field name differs by provider:
+            # - OpenAI:  response.usage.prompt_tokens_details.cached_tokens
+            # - Grok (xAI): response.usage.prompt_cache_hit_tokens
+            if self.provider == "grok":
+                cached_tokens = getattr(response.usage, "prompt_cache_hit_tokens", 0) or 0
+            else:
+                prompt_tokens_details = getattr(response.usage, "prompt_tokens_details", None)
+                if prompt_tokens_details:
+                    cached_tokens = getattr(prompt_tokens_details, "cached_tokens", 0) or 0
 
             # Record cache metrics
+            provider_label = self.provider  # "openai", "grok", "deepseek", etc.
             metrics = get_cache_metrics()
             if cached_tokens > 0:
-                logger.info(f"[CACHE] OpenAI {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache")
-                metrics.record_hit("openai", cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input)
+                logger.info(f"[CACHE] {provider_label} {cache_type} cache hit: {cached_tokens}/{token_count_input} tokens from cache")
+                metrics.record_hit(provider_label, cache_type, cached_tokens=cached_tokens, total_tokens=token_count_input)
             elif system_prompt and len(system_prompt) >= config.min_cache_tokens:
                 # Caching should have been attempted (prompt long enough)
                 # This is a miss - either first call or cache expired
-                metrics.record_miss("openai", cache_type, total_tokens=token_count_input)
+                metrics.record_miss(provider_label, cache_type, total_tokens=token_count_input)
 
             status = "success"
         except Exception as exc:
@@ -1262,22 +1268,24 @@ def _generate_ollama(self, system_prompt: str | None, user_prompt: str) -> Dict[
         try:
             payload = {
                 "model": self.model,
-                "system": system_prompt,
                 "prompt": user_prompt,
                 "stream": False,
+                "format": "json",
                 "options": {
                     "temperature": self.temperature,
                 }
             }
+            if system_prompt:
+                payload["system"] = system_prompt
             url: str = f"{self.remote_url.rstrip('/')}/api/generate"
             response = requests.post(url, json=payload, timeout=600)
             response.raise_for_status()
             result = response.json()
 
             content = result.get("response", "").strip()
-            total_tokens = result.get("usage", {}).get("total_tokens", 0)
             token_count_input = result.get("prompt_eval_count", 0)
             token_count_output = result.get("eval_count", 0)
+            total_tokens = token_count_input + token_count_output
             status = "success"
         except Exception as exc:
             exc_obj = exc

diff --git a/agent_core/core/impl/llm/types.py b/agent_core/core/impl/llm/types.py
@@ -5,6 +5,7 @@
 
 from __future__ import annotations
 
+
 from enum import Enum
Original file line number	Diff line number	Diff line change
Expand Up		@@ -8,6 +8,7 @@

		from __future__ import annotations


		from typing import Optional

		# Import provider exception types
Expand Down
Original file line number	Diff line number	Diff line change
Expand Up		@@ -5,6 +5,7 @@

		from __future__ import annotations


		from enum import Enum


Expand Down