Skip to content

Commit 87acdef

Browse files
feat: tpm-rpm limit in prometheus metrics (BerriAI#19725)
Co-authored-by: Krish Dholakia <krrishdholakia@gmail.com>
1 parent 79603b9 commit 87acdef

File tree

4 files changed

+102
-21
lines changed

4 files changed

+102
-21
lines changed

litellm/integrations/prometheus.py

Lines changed: 65 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -316,6 +316,18 @@ def __init__( # noqa: PLR0915
316316
labelnames=self.get_labels_for_metric("litellm_deployment_state"),
317317
)
318318

319+
self.litellm_deployment_tpm_limit = self._gauge_factory(
320+
"litellm_deployment_tpm_limit",
321+
"Deployment TPM limit found in config",
322+
labelnames=self.get_labels_for_metric("litellm_deployment_tpm_limit"),
323+
)
324+
325+
self.litellm_deployment_rpm_limit = self._gauge_factory(
326+
"litellm_deployment_rpm_limit",
327+
"Deployment RPM limit found in config",
328+
labelnames=self.get_labels_for_metric("litellm_deployment_rpm_limit"),
329+
)
330+
319331
self.litellm_deployment_cooled_down = self._counter_factory(
320332
"litellm_deployment_cooled_down",
321333
"LLM Deployment Analytics - Number of times a deployment has been cooled down by LiteLLM load balancing logic. exception_status is the status of the exception that caused the deployment to be cooled down",
@@ -1778,6 +1790,49 @@ def set_llm_deployment_failure_metrics(self, request_kwargs: dict):
17781790
)
17791791
)
17801792

1793+
def _set_deployment_tpm_rpm_limit_metrics(
1794+
self,
1795+
model_info: dict,
1796+
litellm_params: dict,
1797+
litellm_model_name: Optional[str],
1798+
model_id: Optional[str],
1799+
api_base: Optional[str],
1800+
llm_provider: Optional[str],
1801+
):
1802+
"""
1803+
Set the deployment TPM and RPM limits metrics
1804+
"""
1805+
tpm = model_info.get("tpm") or litellm_params.get("tpm")
1806+
rpm = model_info.get("rpm") or litellm_params.get("rpm")
1807+
1808+
if tpm is not None:
1809+
_labels = prometheus_label_factory(
1810+
supported_enum_labels=self.get_labels_for_metric(
1811+
metric_name="litellm_deployment_tpm_limit"
1812+
),
1813+
enum_values=UserAPIKeyLabelValues(
1814+
litellm_model_name=litellm_model_name,
1815+
model_id=model_id,
1816+
api_base=api_base,
1817+
api_provider=llm_provider,
1818+
),
1819+
)
1820+
self.litellm_deployment_tpm_limit.labels(**_labels).set(tpm)
1821+
1822+
if rpm is not None:
1823+
_labels = prometheus_label_factory(
1824+
supported_enum_labels=self.get_labels_for_metric(
1825+
metric_name="litellm_deployment_rpm_limit"
1826+
),
1827+
enum_values=UserAPIKeyLabelValues(
1828+
litellm_model_name=litellm_model_name,
1829+
model_id=model_id,
1830+
api_base=api_base,
1831+
api_provider=llm_provider,
1832+
),
1833+
)
1834+
self.litellm_deployment_rpm_limit.labels(**_labels).set(rpm)
1835+
17811836
def set_llm_deployment_success_metrics(
17821837
self,
17831838
request_kwargs: dict,
@@ -1811,6 +1866,16 @@ def set_llm_deployment_success_metrics(
18111866
_model_info = _metadata.get("model_info") or {}
18121867
model_id = _model_info.get("id", None)
18131868

1869+
if _model_info or _litellm_params:
1870+
self._set_deployment_tpm_rpm_limit_metrics(
1871+
model_info=_model_info,
1872+
litellm_params=_litellm_params,
1873+
litellm_model_name=litellm_model_name,
1874+
model_id=model_id,
1875+
api_base=api_base,
1876+
llm_provider=llm_provider,
1877+
)
1878+
18141879
remaining_requests: Optional[int] = None
18151880
remaining_tokens: Optional[int] = None
18161881
if additional_headers := standard_logging_payload["hidden_params"][

litellm/litellm_core_utils/get_litellm_params.py

Lines changed: 7 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -93,8 +93,11 @@ def get_litellm_params(
9393
"text_completion": text_completion,
9494
"azure_ad_token_provider": azure_ad_token_provider,
9595
"user_continue_message": user_continue_message,
96-
"base_model": base_model or (
97-
_get_base_model_from_litellm_call_metadata(metadata=metadata) if metadata else None
96+
"base_model": base_model
97+
or (
98+
_get_base_model_from_litellm_call_metadata(metadata=metadata)
99+
if metadata
100+
else None
98101
),
99102
"litellm_trace_id": litellm_trace_id,
100103
"litellm_session_id": litellm_session_id,
@@ -139,5 +142,7 @@ def get_litellm_params(
139142
"aws_sts_endpoint": kwargs.get("aws_sts_endpoint"),
140143
"aws_external_id": kwargs.get("aws_external_id"),
141144
"aws_bedrock_runtime_endpoint": kwargs.get("aws_bedrock_runtime_endpoint"),
145+
"tpm": kwargs.get("tpm"),
146+
"rpm": kwargs.get("rpm"),
142147
}
143148
return litellm_params

litellm/main.py

Lines changed: 19 additions & 19 deletions
Original file line number | Diff line number | Diff line change
@@ -148,7 +148,7 @@
148148
validate_and_fix_openai_messages,
149149
validate_and_fix_openai_tools,
150150
validate_chat_completion_tool_choice,
151-
validate_openai_optional_params
151+
validate_openai_optional_params,
152152
)
153153

154154
from ._logging import verbose_logger
@@ -368,7 +368,7 @@ async def create(self, messages, model=None, **kwargs):
368368

369369
@tracer.wrap()
370370
@client
371-
async def acompletion( # noqa: PLR0915
371+
async def acompletion( # noqa: PLR0915
372372
model: str,
373373
# Optional OpenAI params: see https://platform.openai.com/docs/api-reference/chat/create
374374
messages: List = [],
@@ -603,12 +603,11 @@ async def acompletion( # noqa: PLR0915
603603
if timeout is not None and isinstance(timeout, (int, float)):
604604
timeout_value = float(timeout)
605605
init_response = await asyncio.wait_for(
606-
loop.run_in_executor(None, func_with_context),
607-
timeout=timeout_value
606+
loop.run_in_executor(None, func_with_context), timeout=timeout_value
608607
)
609608
else:
610609
init_response = await loop.run_in_executor(None, func_with_context)
611-
610+
612611
if isinstance(init_response, dict) or isinstance(
613612
init_response, ModelResponse
614613
): ## CACHING SCENARIO
@@ -640,6 +639,7 @@ async def acompletion( # noqa: PLR0915
640639
except asyncio.TimeoutError:
641640
custom_llm_provider = custom_llm_provider or "openai"
642641
from litellm.exceptions import Timeout
642+
643643
raise Timeout(
644644
message=f"Request timed out after {timeout} seconds",
645645
model=model,
@@ -1118,7 +1118,6 @@ def completion( # type: ignore # noqa: PLR0915
11181118
# validate optional params
11191119
stop = validate_openai_optional_params(stop=stop)
11201120

1121-
11221121
######### unpacking kwargs #####################
11231122
args = locals()
11241123

@@ -1135,7 +1134,9 @@ def completion( # type: ignore # noqa: PLR0915
11351134
# Check if MCP tools are present (following responses pattern)
11361135
# Cast tools to Optional[Iterable[ToolParam]] for type checking
11371136
tools_for_mcp = cast(Optional[Iterable[ToolParam]], tools)
1138-
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(tools=tools_for_mcp):
1137+
if LiteLLM_Proxy_MCP_Handler._should_use_litellm_mcp_gateway(
1138+
tools=tools_for_mcp
1139+
):
11391140
# Return coroutine - acompletion will await it
11401141
# completion() can return a coroutine when MCP tools are present, which acompletion() awaits
11411142
return acompletion_with_mcp( # type: ignore[return-value]
@@ -1536,6 +1537,8 @@ def completion( # type: ignore # noqa: PLR0915
15361537
max_retries=max_retries,
15371538
timeout=timeout,
15381539
litellm_request_debug=kwargs.get("litellm_request_debug", False),
1540+
tpm=kwargs.get("tpm"),
1541+
rpm=kwargs.get("rpm"),
15391542
)
15401543
cast(LiteLLMLoggingObj, logging).update_environment_variables(
15411544
model=model,
@@ -2361,11 +2364,7 @@ def completion( # type: ignore # noqa: PLR0915
23612364
input=messages, api_key=api_key, original_response=response
23622365
)
23632366
elif custom_llm_provider == "minimax":
2364-
api_key = (
2365-
api_key
2366-
or get_secret_str("MINIMAX_API_KEY")
2367-
or litellm.api_key
2368-
)
2367+
api_key = api_key or get_secret_str("MINIMAX_API_KEY") or litellm.api_key
23692368

23702369
api_base = (
23712370
api_base
@@ -2413,7 +2412,9 @@ def completion( # type: ignore # noqa: PLR0915
24132412
or custom_llm_provider == "wandb"
24142413
or custom_llm_provider == "clarifai"
24152414
or custom_llm_provider in litellm.openai_compatible_providers
2416-
or JSONProviderRegistry.exists(custom_llm_provider) # JSON-configured providers
2415+
or JSONProviderRegistry.exists(
2416+
custom_llm_provider
2417+
) # JSON-configured providers
24172418
or "ft:gpt-3.5-turbo" in model # finetune gpt-3.5-turbo
24182419
): # allow user to make an openai call with a custom base
24192420
# note: if a user sets a custom base - we should ensure this works
@@ -4724,7 +4725,7 @@ def embedding( # noqa: PLR0915
47244725

47254726
if headers is not None and headers != {}:
47264727
optional_params["extra_headers"] = headers
4727-
4728+
47284729
if encoding_format is not None:
47294730
optional_params["encoding_format"] = encoding_format
47304731
else:
@@ -6759,9 +6760,7 @@ def speech( # noqa: PLR0915
67596760
if text_to_speech_provider_config is None:
67606761
text_to_speech_provider_config = MinimaxTextToSpeechConfig()
67616762

6762-
minimax_config = cast(
6763-
MinimaxTextToSpeechConfig, text_to_speech_provider_config
6764-
)
6763+
minimax_config = cast(MinimaxTextToSpeechConfig, text_to_speech_provider_config)
67656764

67666765
if api_base is not None:
67676766
litellm_params_dict["api_base"] = api_base
@@ -6901,7 +6900,7 @@ async def ahealth_check(
69016900
custom_llm_provider_from_params = model_params.get("custom_llm_provider", None)
69026901
api_base_from_params = model_params.get("api_base", None)
69036902
api_key_from_params = model_params.get("api_key", None)
6904-
6903+
69056904
model, custom_llm_provider, _, _ = get_llm_provider(
69066905
model=model,
69076906
custom_llm_provider=custom_llm_provider_from_params,
@@ -7275,8 +7274,9 @@ def __getattr__(name: str) -> Any:
72757274
_encoding = tiktoken.get_encoding("cl100k_base")
72767275
# Cache it in the module's __dict__ for subsequent accesses
72777276
import sys
7277+
72787278
sys.modules[__name__].__dict__["encoding"] = _encoding
72797279
global _encoding_cache
72807280
_encoding_cache = _encoding
72817281
return _encoding
7282-
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
7282+
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")

litellm/types/integrations/prometheus.py

Lines changed: 11 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -199,6 +199,8 @@ class UserAPIKeyLabelNames(Enum):
199199
"litellm_cache_hits_metric",
200200
"litellm_cache_misses_metric",
201201
"litellm_cached_tokens_metric",
202+
"litellm_deployment_tpm_limit",
203+
"litellm_deployment_rpm_limit",
202204
"litellm_remaining_api_key_requests_for_model",
203205
"litellm_remaining_api_key_tokens_for_model",
204206
"litellm_llm_api_failed_requests_metric",
@@ -406,6 +408,15 @@ class PrometheusMetricLabels:
406408
UserAPIKeyLabelNames.API_PROVIDER.value,
407409
]
408410

411+
litellm_deployment_tpm_limit = [
412+
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
413+
UserAPIKeyLabelNames.MODEL_ID.value,
414+
UserAPIKeyLabelNames.API_BASE.value,
415+
UserAPIKeyLabelNames.API_PROVIDER.value,
416+
]
417+
418+
litellm_deployment_rpm_limit = litellm_deployment_tpm_limit
419+
409420
litellm_deployment_cooled_down = [
410421
UserAPIKeyLabelNames.v2_LITELLM_MODEL_NAME.value,
411422
UserAPIKeyLabelNames.MODEL_ID.value,

0 commit comments

Comments (0)