Merged
8 changes: 0 additions & 8 deletions vllm/config.py
@@ -1379,14 +1379,6 @@ class SchedulerConfig:
 
     is_multimodal_model: bool = False
 
-    # FIXME(woosuk & ywang96): Below are placeholder values. We need to
-    # calculate the actual values from the configurations.
-    # Multimodal encoder run compute budget, only used in V1
-    max_num_encoder_input_tokens = 16384
-
-    # Multimodal encoder cache size, only used in V1
-    encoder_cache_size = 16384
-
     # Whether to perform preemption by swapping or
     # recomputation. If not specified, we determine the mode as follows:
     # We use recomputation by default since it incurs lower overhead than
68 changes: 67 additions & 1 deletion vllm/v1/core/encoder_cache_manager.py
@@ -1,7 +1,15 @@
-from typing import Dict, List, Set, Tuple
+from typing import TYPE_CHECKING, Dict, List, Set, Tuple
 
+from vllm.logger import init_logger
+from vllm.multimodal import MULTIMODAL_REGISTRY
+from vllm.utils import cdiv
 from vllm.v1.request import Request
 
+if TYPE_CHECKING:
+    from vllm.config import ModelConfig, SchedulerConfig
+
+logger = init_logger(__name__)
+
 
 class EncoderCacheManager:

@@ -46,3 +54,61 @@ def get_freed_ids(self) -> List[Tuple[str, int]]:
         freed = self.freed
         self.freed = []
         return freed
+
+
+def compute_encoder_cache_budget(
+    model_config: "ModelConfig",
+    scheduler_config: "SchedulerConfig",
+) -> int:
+    """Compute the encoder cache budget based on the model and scheduler
+    configurations.
+    """
+
+    encoder_cache_budget = 0
+    if not model_config.is_multimodal_model:
+        return encoder_cache_budget
+
+    max_tokens_by_modality_dict = MULTIMODAL_REGISTRY.get_max_tokens_per_item_by_modality(  # noqa: E501
+        model_config)
+
+    modality, max_tokens_per_mm_item = max(max_tokens_by_modality_dict.items(),
+                                           key=lambda item: item[1])
+
+    max_num_batched_tokens = scheduler_config.max_num_batched_tokens
+    max_num_reqs = scheduler_config.max_num_seqs
+
+    # In case the biggest possible multimodal item takes up more space than
+    # the batch size, it needs to be cached and chunk-prefilled.
+    if max_tokens_per_mm_item > max_num_batched_tokens:
+        num_items = 1
+
+    # In case the biggest possible multimodal item takes up less space than
+    # the batch size, all items except the last one will be fully prefilled.
+    else:
+        num_items = cdiv(max_num_batched_tokens, max_tokens_per_mm_item)
Collaborator:
The comment seems a bit confusing to me. I tried to rephrase it based on my understanding, but please help clarify:

num_items == 1:

# The biggest possible multimodal item cannot be prefilled in a batch,
# so it must be cached and chunked prefill.

num_items > 1:

# A batch can cover all (except the last one) multimodal items.

Meanwhile, I don't fully understand what you meant by "cached" and "chunked prefill", though. I suppose they are orthogonal to the number of items?

Member Author (@ywang96, Jan 9, 2025):
I will clarify this. During profiling we always take the worst case (i.e., all requests have the biggest possible multimodal item), so what I meant by "cached" and "chunked prefill" is that each multimodal item will always be needed in two engine steps, since the batch cannot cover the entirety of it.

Collaborator:
That makes sense. Thanks!

Member Author (@ywang96):
Clarified via 2a4b1d5


+    # NOTE: We need the encoder cache to be able to compute & hold ONE
+    # ADDITIONAL multimodal item, which is required only when:
+    # - Two requests in the current batch share the same prefix with such an
+    #   item as part of the prefix.
+    # - AND the prefix length is divisible by the block size, triggering the
+    #   recomputation of the last block.
+    # - AND part of the item's embeddings is in this last block.
+
+    # This can be improved when we have a global encoder cache that does
+    # not associate items to request id only.
+    num_items += 1
Member:
I think this is only applicable to the else block?

Member Author (@ywang96, Jan 9, 2025):
This is applicable to all cases; in fact, hitting it in the if block is how I discovered this issue, which wasn't addressed prior to this PR.

Here's a concrete example:
Suppose max_num_batched_tokens=8192 and two identical requests have length 16032 after processing, with an image at start_index=7333 and end_index=16020 (thus length=8687), and suppose encoder_cache_budget=8687, for the sake of showing how the issue happens when we don't add budget for one additional item.

Time 0: Request 0 gets scheduled for 8192 tokens. Since start_index=7333 < 8192 < end_index=16020 and the cache is empty, image 0 gets processed and the resulting embeddings are cached, so the entire space budget is used up.

Time 1:

  • Request 0 gets scheduled for the remaining 16032 - 8192 = 7840 tokens. An important note here is that scheduling is synchronous, so we treat these tokens as already computed once they are scheduled.
  • The issue happens when we try to schedule Request 1, since there is still space in the batch. Because the two requests are identical, the number of computed tokens for Request 1 is 16032 from the get-go, which triggers a recompute of the last 16 tokens, leaving 16016 computed tokens. However, the image ends at 16020 > 16016, so image 1 is needed here, but the space budget is used up because image 0 is still in the cache.
  • This then triggers the check here:

        if num_encoder_tokens > encoder_budget:
            # The encoder budget is exhausted. We can only schedule the
            # decoder tokens up until the encoder input.
            # NOTE(woosuk): We assume that the encoder tokens should be
            # processed altogether, as the encoder usually uses
            # bidirectional attention.
            num_new_tokens = start_pos - num_computed_tokens
            break

    which sets num_new_tokens to 7333 (start_pos) - 16016 (num_computed_tokens) = -8683, and then crashes the server, as we cannot have a non-positive num_new_tokens.
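The same arithmetic as a minimal runnable sketch (the numbers are the hypothetical ones from this example, not output from the actual scheduler):

    # Two identical requests; the image spans [7333, 16020) -> 8687 tokens,
    # which is also the whole encoder cache budget in this example.
    prompt_len, block_size = 16032, 16
    start_pos, end_pos = 7333, 16020

    # Time 1: Request 1 fully matches Request 0's prefix, so the last
    # block is recomputed.
    num_computed_tokens = prompt_len - block_size    # 16016
    assert end_pos > num_computed_tokens             # image 1 still needed

    # The budget is exhausted (image 0 occupies it), so the scheduler falls
    # back to scheduling only up to the encoder input -- which is now
    # *behind* the computed tokens:
    num_new_tokens = start_pos - num_computed_tokens
    print(num_new_tokens)   # 7333 - 16016 = -8683 -> non-positive, crash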

Collaborator:
Both cases would need this.

Also, for this comment:

    # This can be improved when we have a global encoder cache that does
    # not associate items to request id only.

this cannot address the issue fundamentally, because we also need to guarantee the item is always available in the encoder cache when we schedule the request. For example, suppose an item is used by both request A and request B. Request A has finished, so its prefix and mm items are cached. However, due to the encoder cache budget, one item from request A is evicted before request B arrives. This would result in the same problem.

I guess this can somehow be avoided if we could guarantee that all prefix-cached mm items are always available in the encoder cache as well, but fundamentally this has to be solved by supporting num_tokens=0 in the model runner.

Member Author (@ywang96):
> but fundamentally this has to be solved by supporting num_tokens=0 in the model runner.

That's a good callout! I've adjusted the comment accordingly.


+    # The number of items needed cannot be greater than the max number of
+    # running requests * the max number of multimodal items per request.
+    max_mm_items_per_req = max(
+        MULTIMODAL_REGISTRY.get_mm_limits_per_prompt(model_config).values())
+    num_items = min(num_items, max_num_reqs * max_mm_items_per_req)
Collaborator:
It may be better to emit a warning if num_items < max_num_reqs * max_mm_items_per_req, because it means we are overriding user configurations.

Member Author (@ywang96, Jan 9, 2025):
Here we're actually not overriding user configurations, because the user doesn't get to specify the encoder cache budget (nor could they before this PR, since it was hardcoded).

What we are doing here is simply having the encoder budget calculation respect max_num_reqs. (Consider max_num_reqs=1: the encoder cache then only needs to be able to compute & hold items for one request every step, as the sketch below illustrates.)
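To make that concrete, a small sketch with hypothetical numbers (the names mirror the variables in this PR, not a public API):

    # With a single running request and one image allowed per prompt, the
    # cap shrinks the budget to one item's worth, no matter how many items
    # would otherwise fit in max_num_batched_tokens.
    max_num_reqs = 1
    max_mm_items_per_req = 1
    num_items = 5  # e.g., cdiv(8192, 2048) + 1, before applying the cap
    num_items = min(num_items, max_num_reqs * max_mm_items_per_req)
    assert num_items == 1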

Collaborator:
Oh, I thought MULTIMODAL_REGISTRY.get_mm_limits_per_prompt(model_config) can be configured by users via the mm limit setting? Is that a different config?

Member Author (@ywang96, Jan 9, 2025):
Yes - that's configured by the user, but we're not overwriting that value.

Also keep in mind that this limit is technically only still needed today because in V0 we don't support chunked prefill for multimodal models, so the sequence (and thus all multimodal items in it) needs to be prefilled as a whole, and profiling needs to be done accordingly.

In V1, chunked prefill happens by nature, so this limit doesn't affect how we schedule requests at all; it only affects how engine profiling is done at this specific check. Technically we don't need it anymore, but we still want to keep this argument so that users have a way to set a cap themselves.

Collaborator:
I see. Thanks for the clarification!


+    encoder_cache_budget = num_items * max_tokens_per_mm_item
+    logger.info(
+        "Encoder cache will be initialized with a budget of %s tokens, and "
+        "profiled with %s %s items of the maximum feature size.",
+        encoder_cache_budget, num_items, modality)
+
+    return encoder_cache_budget
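As a worked example of the logic above, a standalone sketch with hypothetical numbers (this helper mirrors compute_encoder_cache_budget but is not the vLLM API):

    def sketch_budget(max_tokens_per_mm_item: int,
                      max_num_batched_tokens: int,
                      max_num_reqs: int,
                      max_mm_items_per_req: int) -> int:
        if max_tokens_per_mm_item > max_num_batched_tokens:
            # The largest item cannot fit in one batch: it is cached and
            # chunk-prefilled, so one in-flight item covers the batch.
            num_items = 1
        else:
            # Ceiling division (cdiv): items needed to cover one full batch.
            num_items = -(-max_num_batched_tokens // max_tokens_per_mm_item)
        # One extra item for the shared-prefix last-block recomputation case.
        num_items += 1
        # Never more items than the running requests can reference.
        num_items = min(num_items, max_num_reqs * max_mm_items_per_req)
        return num_items * max_tokens_per_mm_item

    # Item larger than the batch: (1 + 1) items -> 2 * 8687 = 17374 tokens.
    assert sketch_budget(8687, 8192, 256, 1) == 17374
    # Item smaller than the batch: cdiv(8192, 2048) + 1 = 5 items -> 10240.
    assert sketch_budget(2048, 8192, 256, 1) == 10240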
24 changes: 16 additions & 8 deletions vllm/v1/core/scheduler.py
@@ -3,10 +3,11 @@
 from typing import (TYPE_CHECKING, Deque, Dict, Iterable, List, Optional, Set,
                     Tuple, Union)
 
-from vllm.config import CacheConfig, LoRAConfig, SchedulerConfig
+from vllm.config import CacheConfig, LoRAConfig, ModelConfig, SchedulerConfig
 from vllm.logger import init_logger
 from vllm.sampling_params import SamplingParams
-from vllm.v1.core.encoder_cache_manager import EncoderCacheManager
+from vllm.v1.core.encoder_cache_manager import (EncoderCacheManager,
+                                                compute_encoder_cache_budget)
 from vllm.v1.core.kv_cache_manager import KVCacheManager
 from vllm.v1.engine import EngineCoreOutput
 from vllm.v1.outputs import ModelRunnerOutput
@@ -24,6 +25,7 @@ class Scheduler:
     def __init__(
         self,
         scheduler_config: SchedulerConfig,
+        model_config: ModelConfig,
         cache_config: CacheConfig,
         lora_config: Optional[LoRAConfig],
     ) -> None:
@@ -68,16 +70,22 @@ def __init__(
         self.running_reqs_data: Dict[str, RunningRequestData] = {}
 
         # Encoder-related.
+        # Calculate the encoder cache size if applicable.
+        # NOTE: For now we use the same budget for both compute and space.
+        # This can be changed when we make the encoder cache do embedding
+        # caching across requests.
+        encoder_cache_budget = compute_encoder_cache_budget(
+            model_config, scheduler_config)
+
         # NOTE(woosuk): Here, "encoder" includes the vision encoder (and
         # projector if needed). Currently, we assume that the encoder also
         # has the Transformer architecture (e.g., ViT).
-        self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  #noqa: E501
-        # NOTE(woosuk): For the models without encoder (e.g., text-only models),
-        # the encoder cache will not be initialized and used, regardless of
-        # the cache size. This is because the memory space for the encoder cache
-        # is preallocated in the profiling run.
+        self.max_num_encoder_input_tokens = encoder_cache_budget
+        # NOTE: For models without an encoder (e.g., text-only models), the
+        # encoder cache will not be initialized, because the cache size is 0
+        # for these models.
         self.encoder_cache_manager = EncoderCacheManager(
-            cache_size=self.scheduler_config.encoder_cache_size)
+            cache_size=encoder_cache_budget)
 
     def schedule(self) -> "SchedulerOutput":
         # NOTE(woosuk) on the scheduling algorithm:
9 changes: 6 additions & 3 deletions vllm/v1/engine/core.py
@@ -58,9 +58,12 @@ def __init__(
         vllm_config.cache_config.num_cpu_blocks = num_cpu_blocks
 
         # Setup scheduler.
-        self.scheduler = Scheduler(vllm_config.scheduler_config,
-                                   vllm_config.cache_config,
-                                   vllm_config.lora_config)
+        self.scheduler = Scheduler(
+            scheduler_config=vllm_config.scheduler_config,
+            model_config=vllm_config.model_config,
+            cache_config=vllm_config.cache_config,
+            lora_config=vllm_config.lora_config,
+        )
 
         self._last_logging_time = time.time()
38 changes: 8 additions & 30 deletions vllm/v1/worker/gpu_model_runner.py
@@ -19,6 +19,7 @@
                         LayerBlockType, cdiv, is_pin_memory_available)
 from vllm.v1.attention.backends.flash_attn import (FlashAttentionBackend,
                                                    FlashAttentionMetadata)
+from vllm.v1.core.encoder_cache_manager import compute_encoder_cache_budget
 from vllm.v1.engine.mm_input_mapper import MMInputMapperClient
 from vllm.v1.outputs import ModelRunnerOutput
 from vllm.v1.sample.metadata import SamplingMetadata
@@ -87,8 +88,8 @@ def __init__(
         self.mm_input_mapper_profiling = MMInputMapperClient(self.model_config)
         self.mm_input_mapper_profiling.use_cache = False
 
-        self.max_num_encoder_input_tokens = self.scheduler_config.max_num_encoder_input_tokens  # noqa: E501
-        self.encoder_cache_size = self.scheduler_config.encoder_cache_size
+        self.encoder_cache_budget = compute_encoder_cache_budget(
+            self.model_config, self.scheduler_config)
 
         # Lazy initialization
         # self.model: nn.Module  # Set after load_model
@@ -722,6 +723,10 @@ def profile_run(self) -> None:
         # Profile with multimodal encoder & encoder cache.
         if self.is_multimodal_model:
 
+            # The encoder cache budget should have been set according to the
+            # model and scheduler configurations.
+            assert self.encoder_cache_budget > 0
+
             # Create dummy batch of multimodal inputs.
             dummy_request_data = self.input_registry.dummy_data_for_profiling(
                 model_config=self.model_config,
@@ -739,34 +744,7 @@
             dummy_data_modality, max_tokens_per_mm_item = max(
                 max_tokens_by_modality_dict.items(), key=lambda item: item[1])
 
-            # Check how many items of this modality can be supported by
-            # the encoder cache budget.
-            encoder_cache_budget = min(self.max_num_encoder_input_tokens,
-                                       self.encoder_cache_size)
-            max_num_mm_items_encoder_budget = encoder_cache_budget // \
-                max_tokens_per_mm_item
-
-            # TODO: Allow users to set encoder_cache_budget in case this
-            # happens.
-            assert max_num_mm_items_encoder_budget > 0, (
-                f"Encoder cache budget={encoder_cache_budget} is too small to "
-                f"support the maximum possible size of multimodal embeddings"
-                f"={max_tokens_per_mm_item}.")
-
-            # Check how many items of this modality can be supported by
-            # the decoder budget.
-            max_mm_items_per_req = max(
-                self.mm_registry.get_mm_limits_per_prompt(
-                    self.model_config).values())
-
-            # NOTE: We do not consider max_num_batched_tokens on purpose
-            # because the multimodal embeddings can be generated in advance
-            # and chunked prefilled.
-            max_num_mm_items_decoder_budget = self.max_num_reqs * \
-                max_mm_items_per_req
-
-            max_num_mm_items = min(max_num_mm_items_encoder_budget,
-                                   max_num_mm_items_decoder_budget)
+            max_num_mm_items = self.encoder_cache_budget // max_tokens_per_mm_item  # noqa: E501
 
             # Dummy data definition in V0 may contain multiple multimodal items
             # (e.g., multiple images) for a single request, therefore here we
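As a sanity check on the simplification above, a sketch with hypothetical numbers showing why the single division can replace the old two-budget min(): the decoder-side bound is already folded into the budget's num_items cap:

    # compute_encoder_cache_budget caps num_items at
    # max_num_reqs * max_mm_items_per_req, so dividing the budget by the
    # per-item size can never exceed the old decoder-budget bound.
    max_tokens_per_mm_item = 8687
    max_num_reqs, max_mm_items_per_req = 2, 1

    num_items = min(1 + 1, max_num_reqs * max_mm_items_per_req)   # 2
    encoder_cache_budget = num_items * max_tokens_per_mm_item     # 17374

    max_num_mm_items = encoder_cache_budget // max_tokens_per_mm_item
    assert max_num_mm_items == 2
    assert max_num_mm_items <= max_num_reqs * max_mm_items_per_req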