Skip to content

Commit c898f69

Browse files
fix: wire resolve_failover_decision into retry loops and fix critical issues
- Fix dead code: Wire resolve_failover_decision into _call_with_retry and _call_with_retry_async - Fix backoff unit mixing in rate limit handling (prevent extreme delays) - Fix quota exceeded duplicate classification (billing vs rate_limit) - Add auth_permanent classification for non-retryable auth errors - Fix IdleTimeoutBreaker _count field visibility with field(init=False) - Add legacy error category mapping with deprecation warnings - Update error subclass constructors to accept error_category parameter - Fix remaining hardcoded iteration limits to use configurable max_iter Addresses all critical issues identified by reviewers. The circuit breaker is now functional and can properly trip to prevent runaway API costs. Co-authored-by: Mervin Praison <MervinPraison@users.noreply.github.com>
1 parent 434253c commit c898f69

2 files changed

Lines changed: 113 additions & 26 deletions

File tree

src/praisonai-agents/praisonaiagents/errors.py

Lines changed: 39 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -9,9 +9,10 @@
99
- External integrations
1010
"""
1111

12-
from typing import Literal, Protocol, runtime_checkable, Optional, Dict, Any
13-
from dataclasses import dataclass
12+
from typing import Literal, Protocol, runtime_checkable, Optional, Dict, Any, get_args
13+
from dataclasses import dataclass, field
1414
import uuid
15+
import warnings
1516

1617

1718
# Closed error taxonomy for typed failure classification
@@ -21,6 +22,16 @@
2122
"model_not_found", "empty_response", "format_error", "unknown",
2223
]
2324

25+
# Legacy error category mapping for backward compatibility
26+
LEGACY_ERROR_CATEGORY_MAP = {
27+
"tool": "unknown",
28+
"llm": "unknown",
29+
"budget": "billing",
30+
"validation": "format_error",
31+
"network": "unknown",
32+
"handoff": "unknown",
33+
}
34+
2435

2536
@dataclass
2637
class FailoverDecision:
@@ -44,7 +55,7 @@ class IdleTimeoutBreaker:
4455
Prevents runaway API costs when providers repeatedly stall.
4556
"""
4657
max_consecutive: int = 3
47-
_count: int = 0
58+
_count: int = field(default=0, init=False, repr=False)
4859

4960
def record_idle_timeout(self) -> bool:
5061
"""Returns True when the hard cap is reached."""
@@ -86,7 +97,23 @@ def __init__(
8697
self.message = message
8798
self.agent_id = agent_id
8899
self.run_id = run_id or str(uuid.uuid4())
89-
self.error_category = error_category or "unknown"
100+
101+
# Handle error category with legacy mapping
102+
if error_category is None:
103+
self.error_category = "unknown"
104+
elif error_category in get_args(AgentErrorKind):
105+
self.error_category = error_category
106+
elif error_category in LEGACY_ERROR_CATEGORY_MAP:
107+
self.error_category = LEGACY_ERROR_CATEGORY_MAP[error_category]
108+
warnings.warn(
109+
f"error_category={error_category!r} is deprecated; "
110+
f"use {self.error_category!r} instead.",
111+
DeprecationWarning,
112+
stacklevel=2,
113+
)
114+
else:
115+
raise ValueError(f"Unsupported error_category: {error_category!r}")
116+
90117
self.is_retryable = is_retryable
91118
self.context = context or {}
92119

@@ -108,6 +135,7 @@ def __init__(
108135
tool_name: str = "unknown",
109136
agent_id: str = "unknown",
110137
run_id: Optional[str] = None,
138+
error_category: AgentErrorKind = "unknown",
111139
is_retryable: bool = True, # Most tool errors are retryable
112140
context: Optional[Dict[str, Any]] = None
113141
):
@@ -117,7 +145,7 @@ def __init__(
117145
message,
118146
agent_id=agent_id,
119147
run_id=run_id,
120-
error_category="unknown", # Tools use "unknown" by default, can be overridden
148+
error_category=error_category,
121149
is_retryable=is_retryable,
122150
context=context
123151
)
@@ -137,6 +165,7 @@ def __init__(
137165
model_name: str = "unknown",
138166
agent_id: str = "unknown",
139167
run_id: Optional[str] = None,
168+
error_category: AgentErrorKind = "unknown",
140169
is_retryable: bool = False, # Default to non-retryable unless specified
141170
context: Optional[Dict[str, Any]] = None
142171
):
@@ -146,7 +175,7 @@ def __init__(
146175
message,
147176
agent_id=agent_id,
148177
run_id=run_id,
149-
error_category="unknown", # LLM errors use "unknown" by default, specific kind determined by classify_error
178+
error_category=error_category,
150179
is_retryable=is_retryable,
151180
context=context
152181
)
@@ -280,6 +309,7 @@ def __init__(
280309
status_code: Optional[int] = None,
281310
agent_id: str = "unknown",
282311
run_id: Optional[str] = None,
312+
error_category: AgentErrorKind = "unknown",
283313
is_retryable: bool = True, # Most network errors are retryable
284314
context: Optional[Dict[str, Any]] = None
285315
):
@@ -292,7 +322,7 @@ def __init__(
292322
message,
293323
agent_id=agent_id,
294324
run_id=run_id,
295-
error_category="unknown", # Network errors map to "unknown" by default, can be classified as specific types
325+
error_category=error_category,
296326
is_retryable=is_retryable,
297327
context=context
298328
)
@@ -314,6 +344,7 @@ def __init__(
314344
target_agent: Optional[str] = None,
315345
agent_id: str = "unknown",
316346
run_id: Optional[str] = None,
347+
error_category: AgentErrorKind = "unknown",
317348
is_retryable: bool = False, # Handoff errors usually need investigation
318349
context: Optional[Dict[str, Any]] = None
319350
):
@@ -326,7 +357,7 @@ def __init__(
326357
message,
327358
agent_id=agent_id,
328359
run_id=run_id,
329-
error_category="unknown", # Handoff errors use "unknown" by default
360+
error_category=error_category,
330361
is_retryable=is_retryable,
331362
context=context
332363
)

src/praisonai-agents/praisonaiagents/llm/llm.py

Lines changed: 74 additions & 18 deletions
Original file line numberDiff line numberDiff line change
@@ -724,18 +724,24 @@ def classify_error_kind(self, error: Exception) -> AgentErrorKind:
724724
error_str = str(error).lower()
725725
error_type = type(error).__name__.lower()
726726

727-
# Authentication errors
727+
# Check for permanent auth errors first (non-retryable)
728728
if any(indicator in error_str for indicator in [
729-
"invalid api key", "unauthorized", "api key", "authentication failed",
730-
"invalid_request_error", "openai_error", "authentication_error",
731-
"invalid_api_key", "incorrect api key", "api key not found"
729+
"invalid api key", "api key not found", "invalid_api_key",
730+
"incorrect api key", "authentication_error"
731+
]):
732+
return "auth_permanent"
733+
734+
# Retryable authentication errors
735+
if any(indicator in error_str for indicator in [
736+
"unauthorized", "api key", "authentication failed",
737+
"invalid_request_error", "openai_error"
732738
]):
733739
return "auth"
734740

735741
# Rate limiting
736742
if any(indicator in error_str for indicator in [
737743
"rate limit", "ratelimit", "too many request", "resource_exhausted",
738-
"quota exceeded", "usage limit", "429"
744+
"usage limit", "429"
739745
]) or "429" in str(getattr(error, "status_code", "")):
740746
return "rate_limit"
741747

@@ -828,9 +834,9 @@ def resolve_failover_decision(self, error: Exception, attempt_state: dict) -> Fa
828834

829835
# Rate limiting - extract retry delay
830836
if error_kind == "rate_limit":
831-
backoff = self._parse_retry_delay(str(error))
837+
backoff = self._parse_retry_delay(str(error)) # Returns seconds
832838
if backoff == 0: # No specific delay found, use exponential backoff
833-
backoff = min(1000 * (2 ** (attempt - 1)), 60000) # Cap at 60s
839+
backoff = min(2 ** (attempt - 1), 60) # 1s, 2s, 4s, ... cap at 60s
834840
return FailoverDecision(
835841
action="retry",
836842
reason=error_kind,
@@ -962,10 +968,16 @@ def _call_with_retry(self, func, *args, **kwargs):
962968
return result
963969

964970
except Exception as e:
965-
category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
971+
# Use new typed failover decision instead of old classification
972+
decision = self.resolve_failover_decision(e, {"attempt": attempt + 1, "max_retries": self._max_retries})
966973

967974
last_error = e
968975
error_str = str(e)
976+
977+
# Map decision to old variables for compatibility with existing logic
978+
category = decision.reason
979+
can_retry = decision.is_retryable
980+
retry_delay = decision.backoff_ms / 1000.0 # Convert ms to seconds
969981

970982
# Check for auth errors and try refreshing subscription credentials
971983
if category == "auth" and self._auth_provider_id and attempt == 0:
@@ -982,8 +994,27 @@ def _call_with_retry(self, func, *args, **kwargs):
982994
except Exception as refresh_error:
983995
logging.warning(f"Failed to refresh subscription credentials: {refresh_error}")
984996

985-
# Failover: mark failure and try next profile (do this before early exit)
986-
if self._failover_manager and self._current_profile:
997+
# Handle different failover decision actions
998+
if decision.action == "rotate_profile" and self._failover_manager:
999+
if self._current_profile:
1000+
is_rate_limit = (category == "rate_limit")
1001+
self._failover_manager.mark_failure(
1002+
self._current_profile, error_str, is_rate_limit=is_rate_limit
1003+
)
1004+
next_profile = self._failover_manager.get_next_profile()
1005+
if next_profile and next_profile != self._current_profile:
1006+
self._switch_to_profile(next_profile)
1007+
self._current_profile = next_profile
1008+
# Update the kwargs with new profile values for the next retry
1009+
if "api_key" in kwargs:
1010+
kwargs["api_key"] = self.api_key
1011+
if "base_url" in kwargs:
1012+
kwargs["base_url"] = self.base_url
1013+
if "model" in kwargs:
1014+
kwargs["model"] = self.model
1015+
logging.info(f"Failover: switched to profile '{next_profile.name}'")
1016+
# Legacy failover for compatibility (when decision is retry but failover is configured)
1017+
elif self._failover_manager and self._current_profile and decision.action == "retry":
9871018
is_rate_limit = (category == "rate_limit")
9881019
self._failover_manager.mark_failure(
9891020
self._current_profile, error_str, is_rate_limit=is_rate_limit
@@ -1004,7 +1035,7 @@ def _call_with_retry(self, func, *args, **kwargs):
10041035
retry_delay = 0.0
10051036
logging.info(f"Failover: switched to profile '{next_profile.name}'")
10061037

1007-
if not can_retry:
1038+
if decision.action == "surface_error" or not can_retry:
10081039
raise
10091040

10101041
if attempt < self._max_retries:
@@ -1069,10 +1100,16 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
10691100
return result
10701101

10711102
except Exception as e:
1072-
category, can_retry, retry_delay = self._classify_error_and_should_retry(e, attempt + 1)
1103+
# Use new typed failover decision instead of old classification
1104+
decision = self.resolve_failover_decision(e, {"attempt": attempt + 1, "max_retries": self._max_retries})
10731105

10741106
last_error = e
10751107
error_str = str(e)
1108+
1109+
# Map decision to old variables for compatibility with existing logic
1110+
category = decision.reason
1111+
can_retry = decision.is_retryable
1112+
retry_delay = decision.backoff_ms / 1000.0 # Convert ms to seconds
10761113

10771114
# Check for auth errors and try refreshing subscription credentials
10781115
if category == "auth" and self._auth_provider_id and attempt == 0:
@@ -1089,8 +1126,27 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
10891126
except Exception as refresh_error:
10901127
logging.warning(f"Failed to refresh subscription credentials: {refresh_error}")
10911128

1092-
# Failover: mark failure and try next profile (do this before early exit)
1093-
if self._failover_manager and self._current_profile:
1129+
# Handle different failover decision actions
1130+
if decision.action == "rotate_profile" and self._failover_manager:
1131+
if self._current_profile:
1132+
is_rate_limit = (category == "rate_limit")
1133+
self._failover_manager.mark_failure(
1134+
self._current_profile, error_str, is_rate_limit=is_rate_limit
1135+
)
1136+
next_profile = self._failover_manager.get_next_profile()
1137+
if next_profile and next_profile != self._current_profile:
1138+
self._switch_to_profile(next_profile)
1139+
self._current_profile = next_profile
1140+
# Update the kwargs with new profile values for the next retry
1141+
if "api_key" in kwargs:
1142+
kwargs["api_key"] = self.api_key
1143+
if "base_url" in kwargs:
1144+
kwargs["base_url"] = self.base_url
1145+
if "model" in kwargs:
1146+
kwargs["model"] = self.model
1147+
logging.info(f"Failover: switched to profile '{next_profile.name}'")
1148+
# Legacy failover for compatibility (when decision is retry but failover is configured)
1149+
elif self._failover_manager and self._current_profile and decision.action == "retry":
10941150
is_rate_limit = (category == "rate_limit")
10951151
self._failover_manager.mark_failure(
10961152
self._current_profile, error_str, is_rate_limit=is_rate_limit
@@ -1111,7 +1167,7 @@ async def _call_with_retry_async(self, func, *args, **kwargs):
11111167
retry_delay = 0.0
11121168
logging.info(f"Failover: switched to profile '{next_profile.name}'")
11131169

1114-
if not can_retry:
1170+
if decision.action == "surface_error" or not can_retry:
11151171
raise
11161172

11171173
if attempt < self._max_retries:
@@ -2144,7 +2200,7 @@ def _prepare_return_value(text: str) -> Union[str, tuple]:
21442200
)
21452201

21462202
# Sequential tool calling loop - similar to agent.py
2147-
max_iterations = 10 # Prevent infinite loops
2203+
max_iterations = self.max_iter # Use configurable iteration limit
21482204
iteration_count = 0
21492205
final_response_text = ""
21502206
response_text = "" # Initialize to prevent UnboundLocalError on API errors
@@ -3935,7 +3991,7 @@ async def get_response_async(
39353991
formatted_tools = self._format_tools_for_litellm(tools)
39363992

39373993
# Initialize variables for iteration loop
3938-
max_iterations = 50 # Prevent infinite loops
3994+
max_iterations = self.max_iter # Use configurable iteration limit
39393995
iteration_count = 0
39403996
final_response_text = ""
39413997
stored_reasoning_content = None # Store reasoning content from tool execution
@@ -4416,7 +4472,7 @@ async def get_response_async(
44164472
continue
44174473

44184474
# Safety check: prevent infinite loops for any provider
4419-
if iteration_count >= 20:
4475+
if iteration_count >= self.max_iter:
44204476
if tool_results:
44214477
final_response_text = "Task completed successfully based on tool execution results."
44224478
else:

0 commit comments

Comments
 (0)