fix: wire resolve_failover_decision into retry loops and fix critical issues

praisonai-triage-agent[bot] · MervinPraison · praisonai-triage-agent[bot] · commit c898f69b6a9e · 2026-06-04T08:44:29.000Z
- Fix dead code: Wire resolve_failover_decision into _call_with_retry and _call_with_retry_async
- Fix backoff unit mixing in rate limit handling (prevent extreme delays)
- Fix quota exceeded duplicate classification (billing vs rate_limit)
- Add auth_permanent classification for non-retryable auth errors
- Fix IdleTimeoutBreaker _count field visibility with field(init=False)
- Add legacy error category mapping with deprecation warnings
- Update error subclass constructors to accept error_category parameter
- Fix remaining hardcoded iteration limits to use configurable max_iter

Addresses all critical issues identified by reviewers. The circuit breaker
is now functional and can properly trip to prevent runaway API costs.

Co-authored-by: Mervin Praison <MervinPraison@users.noreply.github.com>
diff --git a/src/praisonai-agents/praisonaiagents/errors.py b/src/praisonai-agents/praisonaiagents/errors.py
@@ -9,9 +9,10 @@
 - External integrations
 """
 
-from typing import Literal, Protocol, runtime_checkable, Optional, Dict, Any
-from dataclasses import dataclass
+from typing import Literal, Protocol, runtime_checkable, Optional, Dict, Any, get_args
+from dataclasses import dataclass, field
 import uuid
+import warnings
 
 
 # Closed error taxonomy for typed failure classification
@@ -21,6 +22,16 @@
     "model_not_found", "empty_response", "format_error", "unknown",
 ]
 
+# Legacy error category mapping for backward compatibility
+LEGACY_ERROR_CATEGORY_MAP = {
+    "tool": "unknown",
+    "llm": "unknown", 
+    "budget": "billing",
+    "validation": "format_error",
+    "network": "unknown",
+    "handoff": "unknown",
+}
+
 
 @dataclass
 class FailoverDecision:
@@ -44,7 +55,7 @@ class IdleTimeoutBreaker:
     Prevents runaway API costs when providers repeatedly stall.
     """
     max_consecutive: int = 3
-    _count: int = 0
+    _count: int = field(default=0, init=False, repr=False)
 
     def record_idle_timeout(self) -> bool:
         """Returns True when the hard cap is reached."""
@@ -86,7 +97,23 @@ def __init__(
         self.message = message
         self.agent_id = agent_id
         self.run_id = run_id or str(uuid.uuid4())
-        self.error_category = error_category or "unknown"
+        
+        # Handle error category with legacy mapping
+        if error_category is None:
+            self.error_category = "unknown"
+        elif error_category in get_args(AgentErrorKind):
+            self.error_category = error_category
+        elif error_category in LEGACY_ERROR_CATEGORY_MAP:
+            self.error_category = LEGACY_ERROR_CATEGORY_MAP[error_category]
+            warnings.warn(
+                f"error_category={error_category!r} is deprecated; "
+                f"use {self.error_category!r} instead.",
+                DeprecationWarning,
+                stacklevel=2,
+            )
+        else:
+            raise ValueError(f"Unsupported error_category: {error_category!r}")
+            
         self.is_retryable = is_retryable
         self.context = context or {}
 
@@ -108,6 +135,7 @@ def __init__(
         tool_name: str = "unknown",
         agent_id: str = "unknown",
         run_id: Optional[str] = None,
+        error_category: AgentErrorKind = "unknown",
         is_retryable: bool = True,  # Most tool errors are retryable
         context: Optional[Dict[str, Any]] = None
     ):
@@ -117,7 +145,7 @@ def __init__(
             message, 
             agent_id=agent_id, 
             run_id=run_id, 
-            error_category="unknown",  # Tools use "unknown" by default, can be overridden
+            error_category=error_category,
             is_retryable=is_retryable,
             context=context
         )
@@ -137,6 +165,7 @@ def __init__(
         model_name: str = "unknown", 
         agent_id: str = "unknown",
         run_id: Optional[str] = None,
+        error_category: AgentErrorKind = "unknown",
         is_retryable: bool = False,  # Default to non-retryable unless specified
         context: Optional[Dict[str, Any]] = None
     ):
@@ -146,7 +175,7 @@ def __init__(
             message, 
             agent_id=agent_id, 
             run_id=run_id, 
-            error_category="unknown",  # LLM errors use "unknown" by default, specific kind determined by classify_error
+            error_category=error_category,
             is_retryable=is_retryable,
             context=context
         )
@@ -280,6 +309,7 @@ def __init__(
         status_code: Optional[int] = None,
         agent_id: str = "unknown",
         run_id: Optional[str] = None,
+        error_category: AgentErrorKind = "unknown",
         is_retryable: bool = True,  # Most network errors are retryable
         context: Optional[Dict[str, Any]] = None
     ):
@@ -292,7 +322,7 @@ def __init__(
             message, 
             agent_id=agent_id, 
             run_id=run_id, 
-            error_category="unknown",  # Network errors map to "unknown" by default, can be classified as specific types
+            error_category=error_category,
             is_retryable=is_retryable,
             context=context
         )
@@ -314,6 +344,7 @@ def __init__(
         target_agent: Optional[str] = None,
         agent_id: str = "unknown",
         run_id: Optional[str] = None,
+        error_category: AgentErrorKind = "unknown",
         is_retryable: bool = False,  # Handoff errors usually need investigation
         context: Optional[Dict[str, Any]] = None
     ):
@@ -326,7 +357,7 @@ def __init__(
             message, 
             agent_id=agent_id, 
             run_id=run_id, 
-            error_category="unknown",  # Handoff errors use "unknown" by default
+            error_category=error_category,
             is_retryable=is_retryable,
             context=context
         )
diff --git a/src/praisonai-agents/praisonaiagents/llm/llm.py b/src/praisonai-agents/praisonaiagents/llm/llm.py
@@ -724,18 +724,24 @@ def classify_error_kind(self, error: Exception) -> AgentErrorKind:
         error_str = str(error).lower()
         error_type = type(error).__name__.lower()
         
-        # Authentication errors
+        # Check for permanent auth errors first (non-retryable)
         if any(indicator in error_str for indicator in [
-            "invalid api key", "unauthorized", "api key", "authentication failed",
-            "invalid_request_error", "openai_error", "authentication_error",
-            "invalid_api_key", "incorrect api key", "api key not found"
+            "invalid api key", "api key not found", "invalid_api_key", 
+            "incorrect api key", "authentication_error"
+        ]):
+            return "auth_permanent"
+        
+        # Retryable authentication errors
+        if any(indicator in error_str for indicator in [
+            "unauthorized", "api key", "authentication failed",
+            "invalid_request_error", "openai_error"
         ]):
             return "auth"
         
         # Rate limiting 
         if any(indicator in error_str for indicator in [
             "rate limit", "ratelimit", "too many request", "resource_exhausted",
-            "quota exceeded", "usage limit", "429"
+            "usage limit", "429"
         ]) or "429" in str(getattr(error, "status_code", "")):
             return "rate_limit"
         
@@ -828,9 +834,9 @@ def resolve_failover_decision(self, error: Exception, attempt_state: dict) -> Fa
         
         # Rate limiting - extract retry delay
         if error_kind == "rate_limit":
-            backoff = self._parse_retry_delay(str(error))
+            backoff = self._parse_retry_delay(str(error))  # Returns seconds
             if backoff == 0:  # No specific delay found, use exponential backoff
-                backoff = min(1000 * (2 ** (attempt - 1)), 60000)  # Cap at 60s
+                backoff = min(2 ** (attempt - 1), 60)  # 1s, 2s, 4s, ... cap at 60s
             return FailoverDecision(
                 action="retry",
                 reason=error_kind, 
@@ -962,10 +968,16 @@ def _call_with_retry(self, func, *args, **kwargs):
                 return result
 
             except Exception as e:
-                category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
+                # Use new typed failover decision instead of old classification
+                decision = self.resolve_failover_decision(e, {"attempt": attempt + 1, "max_retries": self._max_retries})
                 
                 last_error = e
                 error_str = str(e)
+                
+                # Map decision to old variables for compatibility with existing logic
+                category = decision.reason
+                can_retry = decision.is_retryable
+                retry_delay = decision.backoff_ms / 1000.0  # Convert ms to seconds
 
                 # Check for auth errors and try refreshing subscription credentials
                 if category == "auth" and self._auth_provider_id and attempt == 0:
@@ -982,8 +994,27 @@ def _call_with_retry(self, func, *args, **kwargs):
                     except Exception as refresh_error:
                         logging.warning(f"Failed to refresh subscription credentials: {refresh_error}")
 
-                # Failover: mark failure and try next profile (do this before early exit)
-                if self._failover_manager and self._current_profile:
+                # Handle different failover decision actions
+                if decision.action == "rotate_profile" and self._failover_manager:
+                    if self._current_profile:
+                        is_rate_limit = (category == "rate_limit")
+                        self._failover_manager.mark_failure(
+                            self._current_profile, error_str, is_rate_limit=is_rate_limit
+                        )
+                    next_profile = self._failover_manager.get_next_profile()
+                    if next_profile and next_profile != self._current_profile:
+                        self._switch_to_profile(next_profile)
+                        self._current_profile = next_profile
+                        # Update the kwargs with new profile values for the next retry
+                        if "api_key" in kwargs:
+                            kwargs["api_key"] = self.api_key
+                        if "base_url" in kwargs:
+                            kwargs["base_url"] = self.base_url
+                        if "model" in kwargs:
+                            kwargs["model"] = self.model
+                        logging.info(f"Failover: switched to profile '{next_profile.name}'")
+                # Legacy failover for compatibility (when decision is retry but failover is configured)
+                elif self._failover_manager and self._current_profile and decision.action == "retry":
                     is_rate_limit = (category == "rate_limit")
                     self._failover_manager.mark_failure(
                         self._current_profile, error_str, is_rate_limit=is_rate_limit
@@ -1004,7 +1035,7 @@ def _call_with_retry(self, func, *args, **kwargs):
                         retry_delay = 0.0
                         logging.info(f"Failover: switched to profile '{next_profile.name}'")
                 
-                if not can_retry:
+                if decision.action == "surface_error" or not can_retry:
                     raise
 
                 if attempt < self._max_retries:
@@ -1069,10 +1100,16 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
                 return result
 
             except Exception as e:
-                category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
+                # Use new typed failover decision instead of old classification
+                decision = self.resolve_failover_decision(e, {"attempt": attempt + 1, "max_retries": self._max_retries})
                 
                 last_error = e
                 error_str = str(e)
+                
+                # Map decision to old variables for compatibility with existing logic
+                category = decision.reason
+                can_retry = decision.is_retryable
+                retry_delay = decision.backoff_ms / 1000.0  # Convert ms to seconds
 
                 # Check for auth errors and try refreshing subscription credentials
                 if category == "auth" and self._auth_provider_id and attempt == 0:
@@ -1089,8 +1126,27 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
                     except Exception as refresh_error:
                         logging.warning(f"Failed to refresh subscription credentials: {refresh_error}")
 
-                # Failover: mark failure and try next profile (do this before early exit)
-                if self._failover_manager and self._current_profile:
+                # Handle different failover decision actions
+                if decision.action == "rotate_profile" and self._failover_manager:
+                    if self._current_profile:
+                        is_rate_limit = (category == "rate_limit")
+                        self._failover_manager.mark_failure(
+                            self._current_profile, error_str, is_rate_limit=is_rate_limit
+                        )
+                    next_profile = self._failover_manager.get_next_profile()
+                    if next_profile and next_profile != self._current_profile:
+                        self._switch_to_profile(next_profile)
+                        self._current_profile = next_profile
+                        # Update the kwargs with new profile values for the next retry
+                        if "api_key" in kwargs:
+                            kwargs["api_key"] = self.api_key
+                        if "base_url" in kwargs:
+                            kwargs["base_url"] = self.base_url
+                        if "model" in kwargs:
+                            kwargs["model"] = self.model
+                        logging.info(f"Failover: switched to profile '{next_profile.name}'")
+                # Legacy failover for compatibility (when decision is retry but failover is configured)
+                elif self._failover_manager and self._current_profile and decision.action == "retry":
                     is_rate_limit = (category == "rate_limit")
                     self._failover_manager.mark_failure(
                         self._current_profile, error_str, is_rate_limit=is_rate_limit
@@ -1111,7 +1167,7 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
                         retry_delay = 0.0
                         logging.info(f"Failover: switched to profile '{next_profile.name}'")
                 
-                if not can_retry:
+                if decision.action == "surface_error" or not can_retry:
                     raise
 
                 if attempt < self._max_retries:
@@ -2144,7 +2200,7 @@ def _prepare_return_value(text: str) -> Union[str, tuple]:
                     )
 
             # Sequential tool calling loop - similar to agent.py
-            max_iterations = 10  # Prevent infinite loops
+            max_iterations = self.max_iter  # Use configurable iteration limit
             iteration_count = 0
             final_response_text = ""
             response_text = ""  # Initialize to prevent UnboundLocalError on API errors
@@ -3935,7 +3991,7 @@ async def get_response_async(
             formatted_tools = self._format_tools_for_litellm(tools)
 
             # Initialize variables for iteration loop
-            max_iterations = 50  # Prevent infinite loops
+            max_iterations = self.max_iter  # Use configurable iteration limit
             iteration_count = 0
             final_response_text = ""
             stored_reasoning_content = None  # Store reasoning content from tool execution
@@ -4416,7 +4472,7 @@ async def get_response_async(
                         continue
                     
                     # Safety check: prevent infinite loops for any provider
-                    if iteration_count >= 20:
+                    if iteration_count >= self.max_iter:
                         if tool_results:
                             final_response_text = "Task completed successfully based on tool execution results."
                         else: