From 022c6b4a306d0392e671848b08acc91266febf71 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 1 Jan 2025 17:37:26 +0000
Subject: [PATCH 01/47] initial

Signed-off-by: Roger Wang
---
 vllm/multimodal/utils.py | 30 +++++++++++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 7b6ded6a2708..2e491d017f2c 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,6 +1,6 @@
 from functools import lru_cache
 from pathlib import Path
-from typing import Optional, TypeVar, Union
+from typing import TYPE_CHECKING, Optional, TypeVar, Union
 from urllib.parse import ParseResult, urlparse
 
 import numpy as np
@@ -25,6 +25,9 @@
 
 _M = TypeVar("_M")
 
+if TYPE_CHECKING:
+    from ..multimodal import MultiModalPlaceholderDict
+
 
 class MediaConnector:
 
@@ -437,3 +440,28 @@ def consecutive_placeholder_ranges(
         PlaceholderRange(offset=initial_offset + i * item_size,
                          length=item_size) for i in range(num_items)
     ]
+
+
+def merge_and_sort_placeholders_from_modalities(
+    modalities: list[str], mm_positions: "MultiModalPlaceholderDict"
+) -> tuple[list[tuple[str, int]], list[PlaceholderRange]]:
+
+    placeholder_lists_with_modality = [(modality, mm_positions[modality])
+                                       for modality in modalities
+                                       if modality in mm_positions]
+
+    sorted_lists_with_modality = sorted(placeholder_lists_with_modality,
+                                        key=lambda x: x[1][0]['offset'])
+
+    # Verify if the sorted order avoids interleaving
+    merged: list[PlaceholderRange] = []
+    for modality, placeholder_list in sorted_lists_with_modality:
+        if merged and placeholder_list[0]['offset'] < merged[-1]['offset']:
+            raise ValueError(
+                "Interleaved mixed-modality inference is currently not "
+                "supported.")
+        merged.extend(placeholder_list)
+
+    # Return the order of the keys and the merged result
+    return [(modality, len(lst))
+            for modality, lst in sorted_lists_with_modality], merged
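Note: for reference, a minimal self-contained sketch of what the helper added in PATCH 01 computes. PlaceholderRange is modeled here as a plain dict with "offset" and "length" keys (an assumption based on the subscripting in the patch), and the offsets are made up:

    # Hypothetical placeholder layout: two images followed by one video.
    mm_positions = {
        "image": [{"offset": 0, "length": 576}, {"offset": 576, "length": 576}],
        "video": [{"offset": 1152, "length": 1024}],
    }
    modalities = ["image", "video", "audio"]

    present = [(m, mm_positions[m]) for m in modalities if m in mm_positions]
    present.sort(key=lambda x: x[1][0]["offset"])

    merged = []
    for modality, ranges in present:
        # Reject interleaving: a modality's first placeholder must not start
        # before the last placeholder already merged.
        if merged and ranges[0]["offset"] < merged[-1]["offset"]:
            raise ValueError("Interleaved mixed-modality inference is "
                             "currently not supported.")
        merged.extend(ranges)

    print([(m, len(r)) for m, r in present])  # [('image', 2), ('video', 1)]
    print(merged[0]["offset"], merged[-1]["offset"])  # 0 1152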
From 43fdf458bb6fb9c706d5bdbd89666c75389fbc50 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 1 Jan 2025 17:37:40 +0000
Subject: [PATCH 02/47] fix llava ov

Signed-off-by: Roger Wang
---
 vllm/model_executor/models/llava_onevision.py | 33 ++++++++-----------
 1 file changed, 13 insertions(+), 20 deletions(-)

diff --git a/vllm/model_executor/models/llava_onevision.py b/vllm/model_executor/models/llava_onevision.py
index 0bebc1c745e2..cfe0c4c86f44 100644
--- a/vllm/model_executor/models/llava_onevision.py
+++ b/vllm/model_executor/models/llava_onevision.py
@@ -38,8 +38,8 @@
 from .utils import (AutoWeightsLoader, flatten_bn, init_vllm_registered_model,
                     maybe_prefix, merge_multimodal_embeddings)
 
-# Result in the max possible feature size (2x2 grid of 336x336px tiles)
-MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 448
+# Ref: https://github.com/LLaVA-VL/LLaVA-NeXT/blob/main/docs/LLaVA_OneVision.md?plain=1#L14
+MAX_IMAGE_FEATURE_SIZE_HEIGHT = MAX_IMAGE_FEATURE_SIZE_WIDTH = 2304
 
 # For profile run
 _MAX_FRAMES_PER_VIDEO = 16
@@ -366,9 +366,11 @@ def input_processor_for_llava_onevision(ctx: InputContext,
             and "image" not in multi_modal_data):
         return inputs
     if "image" in multi_modal_data:
-        return input_processor_when_multimodal_input_image(ctx, inputs)
+        inputs = input_processor_when_multimodal_input_image(ctx, inputs)
     if "video" in multi_modal_data:
        return input_processor_when_multimodal_input_video(ctx, inputs)
+    else:
+        return inputs
 
     msg = "Unsupported multi data type"
     raise NotImplementedError(msg)
@@ -832,21 +834,18 @@ def get_multimodal_embeddings(
         if not modalities:
             return None
 
-        # We make a tuple of each embedding with its modality string. This is a
-        # temporary workaround for models to handle mixed modalities when
-        # get_multimodal_embeddings and get_input_embeddings are called
-        # separately.
-        # TODO(ywang96): Add support for mixed-modality inference for v1.
-        multimodal_embeddings: List[Tuple[NestedTensors, str]] = []
+        # The resulting multimodal_embeddings is a tuple of tensors, with
+        # each tensor corresponding to a multimodal data item (image or video).
+        multimodal_embeddings: tuple[torch.Tensor, ...] = ()
 
         if "images" in modalities:
             image_input = modalities["images"]
             vision_embeddings = self._process_image_input(image_input)
-            multimodal_embeddings.append((vision_embeddings, "image"))
+            multimodal_embeddings += tuple(vision_embeddings)
         if "videos" in modalities:
             video_input = modalities["videos"]
             video_embeddings = self._process_video_pixels(video_input)
-            multimodal_embeddings.append((video_embeddings, "video"))
+            multimodal_embeddings += tuple(video_embeddings)
 
         return multimodal_embeddings
 
@@ -858,15 +857,9 @@ def get_input_embeddings(
         self,
         input_ids: torch.Tensor,
     ) -> torch.Tensor:
         inputs_embeds = self.language_model.get_input_embeddings(input_ids)
         if multimodal_embeddings is not None:
-            for embeddings, modality in multimodal_embeddings:
-                if modality == "image":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.image_token_index)
-                if modality == "video":
-                    inputs_embeds = merge_multimodal_embeddings(
-                        input_ids, inputs_embeds, embeddings,
-                        self.config.video_token_index)
+            inputs_embeds = merge_multimodal_embeddings(
+                input_ids, inputs_embeds, multimodal_embeddings,
+                [self.config.image_token_index, self.config.video_token_index])
         return inputs_embeds
 
     def forward(
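Note: after PATCH 02, get_multimodal_embeddings returns one flat tuple of per-item tensors, and merge_multimodal_embeddings scatters them over every position holding either placeholder token id. The following is an illustrative sketch of that scatter step only, not vLLM's actual implementation; the function name and shapes are assumptions:

    import torch

    def scatter_mm_embeddings(input_ids, inputs_embeds, mm_embeds,
                              placeholder_token_ids):
        # Mask every sequence position that holds any placeholder token id.
        mask = torch.isin(input_ids, torch.tensor(placeholder_token_ids))
        # mm_embeds: tuple of (num_tokens_i, hidden_size) tensors in prompt
        # order; their total length must match the number of placeholders.
        flat = torch.cat(list(mm_embeds), dim=0)
        assert int(mask.sum()) == flat.shape[0]
        inputs_embeds[mask] = flat
        return inputs_embeds

A single masked scatter is only correct because the concatenation order of mm_embeds matches prompt order, which is exactly the non-interleaving property the helper from PATCH 01 enforces.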
From e0fb002a37f0edb716e409029361e5ad61bf201b Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Wed, 1 Jan 2025 17:38:06 +0000
Subject: [PATCH 03/47] iterate

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_mapper.py |  2 ++
 vllm/v1/request.py                | 22 +++++++++++++++++++---
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index 8bfc739b3dbb..d2dd5f7d07b0 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,6 +1,7 @@
 from typing import Any, Dict, List, Optional
 
 import PIL
+import torch
 from blake3 import blake3
 
 from vllm.config import ModelConfig
@@ -102,6 +103,7 @@ def process_inputs(
                 {"image": [image_inputs[input_id]]},
                 mm_processor_kwargs=mm_processor_kwargs,
             )
+            mm_input["image"] = torch.tensor([])
 
             if self.use_cache:
                 # Add to cache
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index f4783ae366ef..7806fadb79a3 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -4,6 +4,7 @@
 from vllm.inputs import DecoderOnlyInputs, SingletonInputsAdapter, token_inputs
 from vllm.lora.request import LoRARequest
 from vllm.multimodal import MultiModalKwargs
+from vllm.multimodal.utils import merge_and_sort_placeholders_from_modalities
 from vllm.sampling_params import SamplingParams
 from vllm.sequence import RequestMetrics
 from vllm.v1.engine import EngineCoreRequest
@@ -49,16 +50,31 @@ def __init__(
         self.num_computed_tokens = 0
 
         # Multi-modal input metadata.
+        all_modalities = ["image", "video", "audio"]
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
-            # FIXME(woosuk): Support other modalities.
-            self.mm_positions = mm_positions.get("image", [])
+            sorted_modalities, sorted_mm_positions = merge_and_sort_placeholders_from_modalities(  # noqa: E501
+                all_modalities, mm_positions)
+            self.mm_positions = sorted_mm_positions
         else:
             self.mm_positions = []
         # Output of the mm input mapper (e.g., image tensors).
         self.mm_inputs: List[MultiModalKwargs] = []
         if self.inputs.multi_modal_inputs:
-            self.mm_inputs = self.inputs.multi_modal_inputs
+            if len(sorted_modalities) == 1:
+                self.mm_inputs = self.inputs.multi_modal_inputs
+            else:
+                for modality, count in sorted_modalities:
+                    for i in range(len(self.inputs.multi_modal_inputs)):
+                        if modality in self.inputs.multi_modal_inputs[i]:
+                            for j in range(count):
+                                self.inputs.multi_modal_inputs[i +
+                                                               j].pop(modality)
+                                self.mm_inputs.append(
+                                    self.inputs.multi_modal_inputs[i + j])
+                            break
+            assert len(self.mm_inputs) == len(self.inputs.multi_modal_inputs)
+            assert len(self.mm_inputs) == len(self.mm_positions)
 
         self.mm_hashes: List[str] = self.inputs.multi_modal_hashes

From b45010b77e63d363e8d4748fbe44fb88a1921b39 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 2 Jan 2025 07:24:31 +0000
Subject: [PATCH 04/47] revert padding tensor

Signed-off-by: Roger Wang
---
 vllm/v1/engine/mm_input_mapper.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/vllm/v1/engine/mm_input_mapper.py b/vllm/v1/engine/mm_input_mapper.py
index d2dd5f7d07b0..8bfc739b3dbb 100644
--- a/vllm/v1/engine/mm_input_mapper.py
+++ b/vllm/v1/engine/mm_input_mapper.py
@@ -1,7 +1,6 @@
 from typing import Any, Dict, List, Optional
 
 import PIL
-import torch
 from blake3 import blake3
 
 from vllm.config import ModelConfig
@@ -103,7 +102,6 @@ def process_inputs(
                 {"image": [image_inputs[input_id]]},
                 mm_processor_kwargs=mm_processor_kwargs,
             )
-            mm_input["image"] = torch.tensor([])
 
             if self.use_cache:
                 # Add to cache
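Note: the reordering loop added to Request.__init__ in PATCH 03 (reworked in PATCH 05 below) is meant to group the per-item kwargs so their order matches the modality order returned by the placeholder helper. A toy illustration of the intended outcome, with hypothetical plain dicts standing in for MultiModalKwargs:

    inputs = [{"video": "v0"}, {"image": "i0"}, {"image": "i1"}]
    sorted_modalities = [("image", 2), ("video", 1)]  # (modality, item count)

    reordered = []
    for modality, _count in sorted_modalities:
        # Pull all items of this modality forward, preserving relative order.
        reordered.extend(d for d in inputs if modality in d)

    print(reordered)  # [{'image': 'i0'}, {'image': 'i1'}, {'video': 'v0'}]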
From d83e25e5e93fc3341f6e365d78b479ee164cccc4 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 2 Jan 2025 11:10:59 +0000
Subject: [PATCH 05/47] simplify

Signed-off-by: Roger Wang
---
 vllm/multimodal/utils.py |  9 ++++++---
 vllm/v1/request.py       | 40 ++++++++++++++++++++++++----------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 2e491d017f2c..460c9d96bedf 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -444,7 +444,11 @@ def consecutive_placeholder_ranges(
 
 def merge_and_sort_placeholders_from_modalities(
     modalities: list[str], mm_positions: "MultiModalPlaceholderDict"
-) -> tuple[list[tuple[str, int]], list[PlaceholderRange]]:
+) -> tuple[list[str], list[PlaceholderRange]]:
+
+    # For single modality, its placeholder ranges are already sorted.
+    if len(modalities) == 1:
+        return modalities, list(mm_positions[modalities[0]])
 
     placeholder_lists_with_modality = [(modality, mm_positions[modality])
                                        for modality in modalities
                                        if modality in mm_positions]
@@ -463,5 +467,4 @@
         merged.extend(placeholder_list)
 
     # Return the order of the keys and the merged result
-    return [(modality, len(lst))
-            for modality, lst in sorted_lists_with_modality], merged
+    return [modality for modality, _ in sorted_lists_with_modality], merged
diff --git a/vllm/v1/request.py b/vllm/v1/request.py
index 7806fadb79a3..4317b3e70f27 100644
--- a/vllm/v1/request.py
+++ b/vllm/v1/request.py
@@ -50,32 +50,40 @@ def __init__(
         self.num_computed_tokens = 0
 
         # Multi-modal input metadata.
-        all_modalities = ["image", "video", "audio"]
         mm_positions = self.inputs.multi_modal_placeholders
         if mm_positions:
+            available_modalities = mm_positions.keys()
             sorted_modalities, sorted_mm_positions = merge_and_sort_placeholders_from_modalities(  # noqa: E501
-                all_modalities, mm_positions)
+                list(available_modalities), mm_positions)
             self.mm_positions = sorted_mm_positions
         else:
+            sorted_modalities = []
             self.mm_positions = []
+
         # Output of the mm input mapper (e.g., image tensors).
         self.mm_inputs: List[MultiModalKwargs] = []
         if self.inputs.multi_modal_inputs:
-            if len(sorted_modalities) == 1:
-                self.mm_inputs = self.inputs.multi_modal_inputs
-            else:
-                for modality, count in sorted_modalities:
-                    for i in range(len(self.inputs.multi_modal_inputs)):
-                        if modality in self.inputs.multi_modal_inputs[i]:
-                            for j in range(count):
-                                self.inputs.multi_modal_inputs[i +
-                                                               j].pop(modality)
-                                self.mm_inputs.append(
-                                    self.inputs.multi_modal_inputs[i + j])
-                            break
-            assert len(self.mm_inputs) == len(self.inputs.multi_modal_inputs)
-            assert len(self.mm_inputs) == len(self.mm_positions)
+            # NOTE: We only need to sort multimodal kwargs when there
+            # are multiple modalities involved.
+            if len(sorted_modalities) > 1:
+                modality_order_dict = {
+                    modality: order
+                    for order, modality in enumerate(sorted_modalities)
+                }
+
+                # Sanity check to make sure each multimodal input
+                # has only one modality key.
+                for mm_input in self.inputs.multi_modal_inputs:
+                    assert len(mm_input.modalities) == 1
+
+                # Sort MultiModalKwargs to match sorted_mm_positions
+                self.inputs.multi_modal_inputs.sort(
+                    key=lambda mm_input: modality_order_dict[list(
+                        mm_input.modalities)[0]])
+
+            self.mm_inputs = self.inputs.multi_modal_inputs
+        assert len(self.mm_inputs) == len(self.mm_positions)
 
         self.mm_hashes: List[str] = self.inputs.multi_modal_hashes

From d13b0f7533e9c3e5efd0b96bb2885cba3ee9cca9 Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 2 Jan 2025 11:11:12 +0000
Subject: [PATCH 06/47] comment

Signed-off-by: Roger Wang
---
 vllm/multimodal/utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 460c9d96bedf..fefc7c76eed1 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -466,5 +466,5 @@ def merge_and_sort_placeholders_from_modalities(
                 "supported.")
         merged.extend(placeholder_list)
 
-    # Return the order of the keys and the merged result
+    # Return the order of modalities and the merged placeholder ranges
     return [modality for modality, _ in sorted_lists_with_modality], merged
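Note: the sort introduced in PATCH 05 relies on list.sort being stable, so items within one modality keep their original, already-sorted order. A minimal sketch under the same assumption the patch asserts (each MultiModalKwargs carries exactly one modality key), again with plain dicts as stand-ins:

    sorted_modalities = ["image", "video"]
    modality_order = {m: i for i, m in enumerate(sorted_modalities)}

    items = [{"video": "v0"}, {"image": "i0"}, {"image": "i1"}]
    # Stable sort by modality rank; within a modality the order is unchanged.
    items.sort(key=lambda item: modality_order[next(iter(item))])

    print(items)  # [{'image': 'i0'}, {'image': 'i1'}, {'video': 'v0'}]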
From 6959ec0ca8ec37e17f4c755889b2c251334f2bac Mon Sep 17 00:00:00 2001
From: Roger Wang
Date: Thu, 2 Jan 2025 11:58:17 +0000
Subject: [PATCH 07/47] simplify and doc

Signed-off-by: Roger Wang
---
 vllm/multimodal/utils.py | 19 ++++++++++++++++++-
 vllm/v1/request.py       |  3 +--
 2 files changed, 19 insertions(+), 3 deletions(-)

diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index fefc7c76eed1..ef980c48b852 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -443,8 +443,25 @@ def consecutive_placeholder_ranges(
 
 def merge_and_sort_placeholders_from_modalities(
-    modalities: list[str], mm_positions: "MultiModalPlaceholderDict"
+    mm_positions: "MultiModalPlaceholderDict"
 ) -> tuple[list[str], list[PlaceholderRange]]:
+    """Given a MultiModalPlaceholderDict, merge all PlaceholderRange
+    objects from all available modalities into a single list of
+    PlaceholderRange, sorted by their offset (starting index in the input
+    sequence) in ascending order.
+
+    Raises:
+        ValueError: If the input prompt has interleaved placeholders from
+            different modalities (e.g., "