Skip to content

Commit 98b5013

Browse files
vedantjh2 and Vedant Jhaveri authored
add support to enable lora with embedding models (sgl-project#17780)
Co-authored-by: Vedant Jhaveri <vjhaveri@linkedin.com>
1 parent 947927b commit 98b5013

File tree

7 files changed

+260
-0
lines changed

7 files changed

+260
-0
lines changed

python/sglang/srt/entrypoints/engine.py

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -379,6 +379,7 @@ def encode(
379379
audio_data: Optional[MultimodalDataInputFormat] = None,
380380
video_data: Optional[MultimodalDataInputFormat] = None,
381381
dimensions: Optional[int] = None,
382+
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
382383
external_trace_header: Optional[Dict] = None,
383384
rid: Optional[Union[List[str], str]] = None,
384385
) -> Dict:
@@ -392,6 +393,7 @@ def encode(
392393
audio_data=audio_data,
393394
video_data=video_data,
394395
dimensions=dimensions,
396+
lora_path=lora_path,
395397
external_trace_header=external_trace_header,
396398
rid=rid,
397399
)
@@ -406,6 +408,7 @@ async def async_encode(
406408
audio_data: Optional[MultimodalDataInputFormat] = None,
407409
video_data: Optional[MultimodalDataInputFormat] = None,
408410
dimensions: Optional[int] = None,
411+
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None,
409412
external_trace_header: Optional[Dict] = None,
410413
rid: Optional[Union[List[str], str]] = None,
411414
) -> Dict:
@@ -421,6 +424,7 @@ async def async_encode(
421424
audio_data=audio_data,
422425
video_data=video_data,
423426
dimensions=dimensions,
427+
lora_path=lora_path,
424428
external_trace_header=external_trace_header,
425429
rid=rid,
426430
)

python/sglang/srt/entrypoints/openai/protocol.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -897,6 +897,8 @@ class EmbeddingRequest(BaseModel):
897897
rid: Optional[Union[List[str], str]] = None
898898
# Priority for the request
899899
priority: Optional[int] = None
900+
# LoRA adapter path(s)
901+
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
900902

901903

902904
class EmbeddingObject(BaseModel):

python/sglang/srt/entrypoints/openai/serving_embedding.py

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,12 +126,24 @@ def _convert_to_internal_request(
126126
# Other types (should not happen but handle gracefully)
127127
prompt_kwargs = {"input_ids": prompt}
128128

129+
# Resolve LoRA adapter from model parameter or explicit lora_path
130+
lora_path = self._resolve_lora_path(request.model, request.lora_path)
131+
if lora_path:
132+
first_adapter = (
133+
lora_path
134+
if isinstance(lora_path, str)
135+
else next((a for a in lora_path if a), None)
136+
)
137+
if first_adapter:
138+
self._validate_lora_enabled(first_adapter)
139+
129140
adapted_request = EmbeddingReqInput(
130141
**prompt_kwargs,
131142
rid=request.rid,
132143
priority=request.priority,
133144
routing_key=self.extract_routing_key(raw_request),
134145
dimensions=request.dimensions,
146+
lora_path=lora_path,
135147
)
136148

137149
return adapted_request, request

python/sglang/srt/managers/io_struct.py

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -824,6 +824,11 @@ class EmbeddingReqInput(BaseReq, APIServingTimingMixin):
824824
# The number of dimensions the resulting output embeddings should have. It is applicable for Matryoshka Embeddings.
825825
dimensions: Optional[int] = None
826826

827+
# The path to the LoRA adaptors
828+
lora_path: Optional[Union[List[Optional[str]], Optional[str]]] = None
829+
# The uid of LoRA adaptors, should be initialized by tokenizer manager
830+
lora_id: Optional[Union[List[Optional[str]], Optional[str]]] = None
831+
827832
def normalize_batch_and_arguments(self):
828833
# at least one of text, input_ids, or image should be provided
829834
if self.text is None and self.input_ids is None and self.image_data is None:
@@ -875,6 +880,21 @@ def normalize_batch_and_arguments(self):
875880
for i in range(self.batch_size):
876881
self.sampling_params[i]["max_new_tokens"] = 0
877882

883+
self._normalize_lora_paths(self.batch_size)
884+
885+
def _normalize_lora_paths(self, num):
886+
"""Normalize LoRA paths for batch processing."""
887+
if self.lora_path is not None:
888+
if isinstance(self.lora_path, str):
889+
self.lora_path = [self.lora_path] * num
890+
elif isinstance(self.lora_path, list):
891+
if len(self.lora_path) != num:
892+
raise ValueError(
893+
f"lora_path list length ({len(self.lora_path)}) must match batch size ({num})"
894+
)
895+
else:
896+
raise ValueError("lora_path should be a list or a string.")
897+
878898
def contains_mm_input(self) -> bool:
879899
return (
880900
has_valid_data(self.image_data)
@@ -888,6 +908,8 @@ def __getitem__(self, i):
888908
text=[self.text[i]] if self.text is not None else None,
889909
sampling_params=self.sampling_params[i],
890910
rid=self.rid[i],
911+
lora_path=self.lora_path[i] if self.lora_path is not None else None,
912+
lora_id=self.lora_id[i] if self.lora_id is not None else None,
891913
is_cross_encoder_request=True,
892914
http_worker_ipc=self.http_worker_ipc,
893915
)
@@ -900,6 +922,8 @@ def __getitem__(self, i):
900922
video_data=self.video_data[i] if self.video_data is not None else None,
901923
sampling_params=self.sampling_params[i],
902924
rid=self.rid[i],
925+
lora_path=self.lora_path[i] if self.lora_path is not None else None,
926+
lora_id=self.lora_id[i] if self.lora_id is not None else None,
903927
external_trace_header=self.external_trace_header,
904928
dimensions=self.dimensions,
905929
http_worker_ipc=self.http_worker_ipc,
@@ -928,6 +952,8 @@ class TokenizedEmbeddingReqInput(BaseReq):
928952
priority: Optional[int] = None
929953
# The number of dimensions the resulting output embeddings should have. It is applicable for Matryoshka Embeddings.
930954
dimensions: Optional[int] = None
955+
# LoRA related
956+
lora_id: Optional[str] = None # None means just use the base model
931957

932958

933959
@dataclass

python/sglang/srt/managers/scheduler.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1762,6 +1762,7 @@ def handle_embedding_request(
17621762
token_type_ids=recv_req.token_type_ids,
17631763
priority=recv_req.priority,
17641764
dimensions=recv_req.dimensions,
1765+
lora_id=recv_req.lora_id,
17651766
http_worker_ipc=recv_req.http_worker_ipc,
17661767
)
17671768
req.tokenizer = self.tokenizer

python/sglang/srt/managers/tokenizer_manager.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -958,6 +958,7 @@ def _create_tokenized_object(
958958
rid=obj.rid,
959959
priority=obj.priority,
960960
dimensions=obj.dimensions,
961+
lora_id=obj.lora_id,
961962
http_worker_ipc=obj.http_worker_ipc,
962963
)
963964

Lines changed: 214 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,214 @@
1+
# Copyright 2023-2024 SGLang Team
2+
# Licensed under the Apache License, Version 2.0 (the "License");
3+
# you may not use this file except in compliance with the License.
4+
# You may obtain a copy of the License at
5+
#
6+
# http://www.apache.org/licenses/LICENSE-2.0
7+
#
8+
# Unless required by applicable law or agreed to in writing, software
9+
# distributed under the License is distributed on an "AS IS" BASIS,
10+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11+
# See the License for the specific language governing permissions and
12+
# limitations under the License.
13+
# ==============================================================================
14+
"""
15+
Unit tests for LoRA support in embedding models.
16+
17+
Validates that EmbeddingReqInput correctly handles LoRA fields through
18+
normalization, batching, and request splitting.
19+
"""
20+
21+
import multiprocessing as mp
22+
import unittest
23+
24+
import numpy as np
25+
import torch
26+
27+
from sglang.srt.entrypoints.openai.protocol import EmbeddingRequest
28+
from sglang.srt.managers.io_struct import EmbeddingReqInput, TokenizedEmbeddingReqInput
29+
from sglang.srt.sampling.sampling_params import SamplingParams
30+
from sglang.test.ci.ci_register import register_cuda_ci
31+
from sglang.test.runners import SRTRunner
32+
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, CustomTestCase
33+
34+
# Test configuration (same model/LoRA as test_lora_hf_sgl_logprob_diff.py)
35+
MODEL_PATH = "meta-llama/Llama-2-7b-hf"
36+
LORA_PATH = "yushengsu/sglang_lora_logprob_diff_without_tuning"
37+
LORA_BACKEND = "triton"
38+
SIMILARITY_THRESHOLD = 0.9999
39+
40+
register_cuda_ci(
41+
est_time=150,
42+
suite="nightly-1-gpu",
43+
)
44+
45+
46+
class TestEmbeddingLoraSupport(unittest.TestCase):
47+
"""Test LoRA support in embedding request structures."""
48+
49+
def test_embedding_lora_fields(self):
50+
"""Test LoRA fields exist and work correctly across all embedding structures."""
51+
# EmbeddingReqInput: fields exist, normalization expands single to batch, indexing works
52+
req = EmbeddingReqInput(
53+
text=["Hello", "World"], lora_path="my-adapter", lora_id=["id1", "id2"]
54+
)
55+
self.assertIsNotNone(req.lora_path)
56+
req.normalize_batch_and_arguments()
57+
self.assertEqual(req.lora_path, ["my-adapter", "my-adapter"])
58+
self.assertEqual(req[0].lora_path, "my-adapter")
59+
self.assertEqual(req[1].lora_id, "id2")
60+
61+
# EmbeddingReqInput: mismatched list length raises error
62+
req = EmbeddingReqInput(text=["Hello", "World", "Test"], lora_path=["adapter1"])
63+
with self.assertRaises(ValueError):
64+
req.normalize_batch_and_arguments()
65+
66+
# TokenizedEmbeddingReqInput and EmbeddingRequest have lora fields
67+
tokenized = TokenizedEmbeddingReqInput(
68+
input_text="Hello",
69+
input_ids=[1, 2, 3],
70+
image_inputs={},
71+
token_type_ids=[],
72+
sampling_params=SamplingParams(),
73+
lora_id="my-lora-id",
74+
)
75+
self.assertEqual(tokenized.lora_id, "my-lora-id")
76+
self.assertEqual(
77+
EmbeddingRequest(
78+
input="Hello", model="test", lora_path="adapter"
79+
).lora_path,
80+
"adapter",
81+
)
82+
83+
84+
class TestEmbeddingLoraHFComparison(CustomTestCase):
85+
"""Compare HF+LoRA vs SGLang+LoRA embedding outputs."""
86+
87+
@classmethod
88+
def get_hf_embedding_with_lora(cls, model_path, lora_path, texts, torch_dtype):
89+
"""Get embeddings from HuggingFace model with LoRA adapter."""
90+
from peft import PeftModel
91+
from transformers import AutoModelForCausalLM, AutoTokenizer
92+
93+
# Load base model as CausalLM to match adapter's expected structure
94+
base_model = AutoModelForCausalLM.from_pretrained(
95+
model_path,
96+
torch_dtype=torch_dtype,
97+
trust_remote_code=True,
98+
).cuda()
99+
100+
# Load LoRA adapter
101+
model = PeftModel.from_pretrained(base_model, lora_path)
102+
model.eval()
103+
104+
tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
105+
if tokenizer.pad_token is None:
106+
tokenizer.pad_token = tokenizer.eos_token
107+
108+
with torch.no_grad():
109+
inputs = tokenizer(
110+
texts, padding=True, truncation=True, return_tensors="pt"
111+
).to("cuda")
112+
113+
# Access the inner model (CausalLM wraps the base model)
114+
outputs = model.model(**inputs, output_hidden_states=True)
115+
hidden_states = outputs.hidden_states[-1]
116+
117+
# Last token pooling with L2 normalization (matching SGLang)
118+
attention_mask = inputs["attention_mask"]
119+
last_token_indices = attention_mask.sum(dim=1) - 1
120+
batch_size = hidden_states.shape[0]
121+
embeddings = hidden_states[
122+
torch.arange(batch_size, device="cuda"), last_token_indices
123+
]
124+
embeddings = embeddings / embeddings.norm(dim=1, keepdim=True)
125+
126+
# Cleanup
127+
del model, base_model
128+
torch.cuda.empty_cache()
129+
130+
return embeddings.cpu().numpy()
131+
132+
@classmethod
133+
def get_sglang_embedding_with_lora(cls, model_path, lora_path, texts, torch_dtype):
134+
"""Get embeddings from SGLang with LoRA adapter."""
135+
with SRTRunner(
136+
model_path,
137+
torch_dtype=torch_dtype,
138+
model_type="embedding",
139+
lora_paths=[lora_path],
140+
lora_backend=LORA_BACKEND,
141+
port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
142+
trust_remote_code=True,
143+
mem_fraction_static=0.88,
144+
) as runner:
145+
# Call engine.encode directly with lora_path
146+
response = runner.engine.encode(prompt=texts, lora_path=lora_path)
147+
if isinstance(response, list):
148+
embeddings = [r["embedding"] for r in response]
149+
else:
150+
embeddings = [response["embedding"]]
151+
152+
return np.array(embeddings)
153+
154+
@staticmethod
155+
def cosine_similarity(a, b):
156+
"""Compute cosine similarity between vectors."""
157+
return np.dot(a, b) / (np.linalg.norm(a) * np.linalg.norm(b))
158+
159+
def test_embedding_lora_hf_sglang_similarity(self):
160+
"""Test that HF+LoRA and SGLang+LoRA produce similar embeddings."""
161+
test_texts = [
162+
"Hello world",
163+
"This is a test sentence for embedding comparison",
164+
]
165+
166+
print(f"\nModel: {MODEL_PATH}")
167+
print(f"LoRA: {LORA_PATH}")
168+
169+
# Get SGLang embeddings first (before HF loads model into GPU)
170+
# This order matches test_lora_hf_sgl_logprob_diff.py and avoids OOM
171+
print("\nGetting SGLang embeddings...")
172+
sglang_embeddings = self.get_sglang_embedding_with_lora(
173+
MODEL_PATH, LORA_PATH, test_texts, torch.float16
174+
)
175+
176+
# Clear GPU memory
177+
torch.cuda.empty_cache()
178+
179+
# Get HF embeddings
180+
print("Getting HF embeddings...")
181+
hf_embeddings = self.get_hf_embedding_with_lora(
182+
MODEL_PATH, LORA_PATH, test_texts, torch.float16
183+
)
184+
185+
# Compare embeddings
186+
print("\nHF vs SGLang LoRA Embedding Comparison:")
187+
similarities = []
188+
for i, (hf_emb, sgl_emb) in enumerate(zip(hf_embeddings, sglang_embeddings)):
189+
sim = self.cosine_similarity(hf_emb, sgl_emb)
190+
similarities.append(sim)
191+
print(f" Text {i}: cosine similarity = {sim:.6f}")
192+
self.assertGreater(
193+
sim,
194+
SIMILARITY_THRESHOLD,
195+
f"Text {i} similarity {sim:.6f} below threshold {SIMILARITY_THRESHOLD}",
196+
)
197+
198+
avg_similarity = np.mean(similarities)
199+
print(f" Average similarity: {avg_similarity:.6f}")
200+
print(f" Threshold: {SIMILARITY_THRESHOLD}")
201+
202+
self.assertGreater(
203+
avg_similarity,
204+
SIMILARITY_THRESHOLD,
205+
f"Average similarity {avg_similarity:.4f} below threshold {SIMILARITY_THRESHOLD}",
206+
)
207+
208+
209+
if __name__ == "__main__":
210+
try:
211+
mp.set_start_method("spawn")
212+
except RuntimeError:
213+
pass
214+
unittest.main()

0 commit comments

Comments (0)