Skip to content

Commit b382890

Browse files
committed
[Fix] Add lora tied lm head support (for Qwen2.5, Gemma, etc model need) (sgl-project#18634)
1 parent e326729 commit b382890

File tree

5 files changed

+313
-4
lines changed

5 files changed

+313
-4
lines changed

python/sglang/srt/lora/lora.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -102,13 +102,24 @@ def _process_weight(self, name: str, loaded_weight: torch.Tensor):
102102
self.config.target_modules
103103
)
104104

105+
# Remap PEFT "unembed_tokens" key to "lm_head" so the weight is
106+
# recognized and loaded into the correct buffer.
107+
if "unembed_tokens" in name:
108+
name = name.replace("unembed_tokens", "lm_head")
109+
105110
layer_id = get_layer_id(name)
106111
if layer_id is not None:
107112
self.layers[layer_id].weights[name] = loaded_weight.cpu()
108113
elif "embed_tokens" in name or "lm_head" in name:
109-
# Check if this module is declared in target_modules before loading
114+
# Check if this module is declared in target_modules before loading.
115+
# When normalized_target_modules is {"all"} (e.g. target_modules was
116+
# "all-linear"), we allow loading since the server-level
117+
# --lora-target-modules will govern which modules are active.
110118
module_name = "embed_tokens" if "embed_tokens" in name else "lm_head"
111-
if module_name in normalized_target_modules:
119+
if (
120+
"all" in normalized_target_modules
121+
or module_name in normalized_target_modules
122+
):
112123
self.embedding_layers[name] = loaded_weight.cpu()
113124
else:
114125
logger.debug(

python/sglang/srt/lora/lora_manager.py

Lines changed: 61 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -387,6 +387,33 @@ def init_lora_shapes(
387387
)
388388

389389
for lora_id, config in self.configs.items():
390+
# Handle PEFT shorthand strings like "all-linear" or "all".
391+
# These cannot be resolved to concrete module names without
392+
# inspecting the base model, so we require the user to specify
393+
# --lora-target-modules explicitly when such shorthands are used.
394+
if isinstance(config.target_modules, str):
395+
if config.target_modules in ("all-linear", "all"):
396+
if target_modules is not None:
397+
# CLI --lora-target-modules already provided; skip
398+
# per-adapter inference for this adapter.
399+
continue
400+
else:
401+
lora_name = self.lora_refs[lora_id].lora_name
402+
raise ValueError(
403+
f"LoRA adapter '{lora_name}' uses "
404+
f"target_modules='{config.target_modules}' which cannot "
405+
"be resolved automatically. Please explicitly specify "
406+
"--lora-target-modules during server startup. You can "
407+
"specify 'all' to enable all supported module types."
408+
)
409+
else:
410+
raise ValueError(
411+
f"SGLang does not recognize target_modules="
412+
f"'{config.target_modules}'. Please use a list of module "
413+
"name suffixes in the adapter's PEFT config, or explicitly "
414+
"specify --lora-target-modules during server startup."
415+
)
416+
390417
if not isinstance(config.target_modules, list):
391418
raise ValueError(
392419
f"SGLang currently only supports inferring LoRA target modules when a list of "
@@ -541,6 +568,40 @@ def init_lora_modules(self):
541568
self.embed_tokens_module: Optional[BaseLayerWithLoRA] = None
542569
self.lm_head_module: Optional[BaseLayerWithLoRA] = None
543570

571+
# When tie_word_embeddings=True, lm_head is the same Python object as
572+
# embed_tokens. PyTorch's named_modules() deduplicates by object identity,
573+
# so lm_head will not appear as a separate entry in the scan below,
574+
# preventing LoRA from wrapping it. To fix this, we create a new
575+
# ParallelLMHead that shares the same base weight tensor (no extra GPU
576+
# memory) so that named_modules() yields it as an independent module.
577+
if "lm_head" in self.target_modules:
578+
lm_head = getattr(self.base_model, "lm_head", None)
579+
embed_tokens = None
580+
for name, mod in self.base_model.named_modules():
581+
if name.endswith("embed_tokens"):
582+
embed_tokens = mod
583+
break
584+
if (
585+
lm_head is not None
586+
and embed_tokens is not None
587+
and lm_head is embed_tokens
588+
):
589+
logger.info(
590+
"lm_head is tied with embed_tokens. Creating a separate "
591+
"ParallelLMHead that shares the base weight for LoRA support."
592+
)
593+
untied_lm_head = ParallelLMHead(
594+
num_embeddings=embed_tokens.org_vocab_size,
595+
embedding_dim=embed_tokens.embedding_dim,
596+
params_dtype=embed_tokens.weight.dtype,
597+
org_num_embeddings=embed_tokens.org_vocab_size,
598+
)
599+
# Share the base weight tensor — no additional GPU memory.
600+
untied_lm_head.weight = embed_tokens.weight
601+
# Replace the model attribute so named_modules() sees it
602+
# independently.
603+
self.base_model.lm_head = untied_lm_head
604+
544605
for module_name, module in self.base_model.named_modules():
545606
# TODO (lifuhuang): in the future, we should consider generalizing the
546607
# should_apply_lora function to support mapping by full module name instead

python/sglang/srt/lora/mem_pool.py

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -115,6 +115,8 @@ def _can_support(config: LoRAConfig) -> bool:
115115
if config.lora_added_tokens_size > self.lora_added_tokens_size:
116116
return False
117117
target_module_names = get_normalized_target_modules(config.target_modules)
118+
if "all" in target_module_names:
119+
return True
118120
return target_module_names.issubset(self.target_modules)
119121

120122
if isinstance(config, LoRAConfig):

python/sglang/srt/lora/utils.py

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
from dataclasses import dataclass
22
from enum import Enum
3-
from typing import Iterable, Optional, Set, Tuple
3+
from typing import Iterable, Optional, Set, Tuple, Union
44

55
import torch
66

@@ -98,12 +98,22 @@ def get_hidden_dim(
9898

9999

100100
def get_normalized_target_modules(
101-
target_modules: Iterable[str],
101+
target_modules: Union[str, Iterable[str]],
102102
) -> set[str]:
103103
"""
104104
Mapping a list of target module name to names of the normalized LoRA weights.
105105
Handles both base module names (e.g., "gate_proj") and prefixed module names (e.g., "feed_forward.gate_proj").
106+
107+
Also handles PEFT shorthand strings like "all-linear" or "all" by returning
108+
{"all"} as a sentinel value (the caller should check for "all" and fall
109+
back to the CLI --lora-target-modules to determine the concrete module set).
106110
"""
111+
# Handle PEFT shorthand strings — these cannot be resolved to concrete
112+
# module names without inspecting the base model, so we return {"all"}
113+
# and let the caller fall back to the CLI --lora-target-modules.
114+
if isinstance(target_modules, str):
115+
return {"all"}
116+
107117
params_mapping = {
108118
"q_proj": "qkv_proj",
109119
"k_proj": "qkv_proj",
@@ -116,6 +126,7 @@ def get_normalized_target_modules(
116126
"word_embeddings": "embed_tokens",
117127
"lm_head": "lm_head",
118128
"output": "lm_head",
129+
"unembed_tokens": "lm_head",
119130
}
120131

121132
result = set()
Lines changed: 224 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,224 @@
# Copyright 2023-2025 SGLang Team
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ==============================================================================

"""
Test LoRA on models with tied lm_head (tie_word_embeddings=True).

When tie_word_embeddings=True, lm_head shares the same weight tensor as
embed_tokens. PyTorch's named_modules() deduplicates by object identity,
so lm_head won't appear as a separate module. This test validates that
SGLang correctly handles this case by untying lm_head before LoRA wrapping.

The test:
1. Programmatically creates a LoRA adapter with lm_head in target_modules
   using PEFT on a model with tie_word_embeddings=True (Qwen/Qwen2.5-0.5B).
2. Compares logprobs between HuggingFace+PEFT and SGLang to ensure numerical
   consistency. This implicitly verifies no NaN values are produced and that
   LoRA is actually being applied (since HF+PEFT is the trusted reference).
"""

import multiprocessing as mp
import os
import shutil
import subprocess
import sys
import tempfile
import unittest

import torch

try:
    from peft import LoraConfig, get_peft_model
except ImportError:
    # Install into the *current* interpreter's environment. Invoking a bare
    # "pip" executable may resolve to a different Python installation, so run
    # pip as a module of sys.executable instead.
    subprocess.check_call([sys.executable, "-m", "pip", "install", "peft", "--no-deps"])
    from peft import LoraConfig, get_peft_model

from transformers import AutoModelForCausalLM

from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.runners import HFRunner, SRTRunner
from sglang.test.test_utils import DEFAULT_PORT_FOR_SRT_TEST_RUNNER, CustomTestCase

register_cuda_ci(est_time=120, suite="nightly-1-gpu", nightly=True)

# Use a small model with tie_word_embeddings=True
BASE_MODEL = "Qwen/Qwen2.5-0.5B"

TEST_PROMPTS = [
    "AI is a field of computer science focused on",
    "The capital of France is",
]

MAX_NEW_TOKENS = 16
# Maximum allowed absolute difference between HF+PEFT and SGLang logprobs.
LOGPROB_THRESHOLD = 2e-1
67+
def create_lora_adapter_with_lm_head(base_model_name: str, output_dir: str):
    """
    Programmatically create a LoRA adapter that targets lm_head,
    using a model with tie_word_embeddings=True.

    The adapter uses randomly initialized LoRA weights (no training).
    This is sufficient to test that:
    - SGLang can load the adapter without errors
    - lm_head LoRA is applied (output differs from base model)
    - Logprobs match between HF and SGLang

    Args:
        base_model_name: HF hub id of a base model with tie_word_embeddings=True.
        output_dir: Directory where the PEFT adapter is saved.

    Raises:
        AssertionError: if the model is not tied, or the saved adapter
            contains no lm_head LoRA weights.
    """
    model = AutoModelForCausalLM.from_pretrained(
        base_model_name,
        torch_dtype=torch.float16,
        device_map="cpu",
    )

    # Verify the model actually has tied embeddings
    assert (
        model.config.tie_word_embeddings
    ), f"Expected tie_word_embeddings=True for {base_model_name}"

    # Only target lm_head to isolate the test to the tied-embedding scenario.
    lora_config = LoraConfig(
        r=8,
        lora_alpha=16,
        target_modules=["lm_head"],
        lora_dropout=0,
        bias="none",
        task_type="CAUSAL_LM",
    )

    peft_model = get_peft_model(model, lora_config)

    # PEFT initializes lora_B to zeros by default, which makes the adapter
    # produce identical output to the base model. Initialize lora_B with
    # non-zero random weights so the adapter has a visible effect.
    with torch.no_grad():
        for name, param in peft_model.named_parameters():
            if "lora_B" in name:
                torch.nn.init.normal_(param, mean=0.0, std=0.02)

    peft_model.save_pretrained(output_dir)

    # Verify the saved adapter contains lm_head keys. Use a context manager
    # so the safetensors file handle is closed deterministically (the
    # original code leaked the handle).
    from safetensors import safe_open

    safetensors_path = os.path.join(output_dir, "adapter_model.safetensors")
    with safe_open(safetensors_path, framework="pt") as f:
        lm_head_keys = [k for k in f.keys() if "lm_head" in k]
        assert (
            len(lm_head_keys) > 0
        ), f"Expected lm_head LoRA weights in adapter, got keys: {sorted(f.keys())}"

    print(f"Created LoRA adapter at {output_dir}")
    print(f"  lm_head keys: {lm_head_keys}")

    # Clean up the model to free memory
    del peft_model, model
    torch.cuda.empty_cache()
127+
128+
129+
class TestLoRATiedLMHead(CustomTestCase):
    """
    Test that LoRA works correctly on models with tied lm_head.
    """

    # Path of the temporary PEFT adapter created in setUpClass.
    _adapter_dir = None

    @classmethod
    def setUpClass(cls):
        """Create a temporary LoRA adapter with lm_head targeting."""
        super().setUpClass()
        cls._adapter_dir = tempfile.mkdtemp(prefix="sglang_test_lora_tied_lm_head_")
        create_lora_adapter_with_lm_head(BASE_MODEL, cls._adapter_dir)

    @classmethod
    def tearDownClass(cls):
        """Clean up the temporary adapter directory."""
        if cls._adapter_dir and os.path.exists(cls._adapter_dir):
            shutil.rmtree(cls._adapter_dir)
        super().tearDownClass()

    def _assert_logprobs_close(self, prompt_idx, phase, srt_vals, hf_vals):
        # Compare one prompt's logprobs between SGLang and HF+PEFT; `phase`
        # is "prefill" or "decode" and only affects the reported messages.
        srt_tensor = torch.tensor(srt_vals)
        hf_tensor = torch.tensor(hf_vals)
        max_diff = torch.max(torch.abs(srt_tensor - hf_tensor)).item()
        print(
            f"Prompt {prompt_idx} {phase} logprob max_diff (SGLang vs HF): {max_diff:.6e}"
        )
        self.assertLess(
            max_diff,
            LOGPROB_THRESHOLD,
            f"Prompt {prompt_idx}: {phase} logprob diff {max_diff:.6e} "
            f"exceeds threshold {LOGPROB_THRESHOLD:.0e}",
        )

    def test_tied_lm_head_lora_hf_sgl_logprob_match(self):
        """
        Compare logprobs between HuggingFace+PEFT and SGLang+LoRA
        for a tied lm_head adapter, ensuring numerical consistency.
        """
        prompts = TEST_PROMPTS[:2]
        adapter_per_prompt = [self._adapter_dir] * len(prompts)

        # Run SGLang with LoRA
        with SRTRunner(
            BASE_MODEL,
            torch_dtype=torch.float16,
            model_type="generation",
            lora_paths=[self._adapter_dir],
            max_loras_per_batch=1,
            lora_backend="triton",
            lora_target_modules=["lm_head"],
            disable_cuda_graph=True,
            disable_radix_cache=True,
            mem_fraction_static=0.80,
            port=DEFAULT_PORT_FOR_SRT_TEST_RUNNER,
        ) as srt_runner:
            srt_outputs = srt_runner.forward(
                prompts,
                max_new_tokens=MAX_NEW_TOKENS,
                lora_paths=adapter_per_prompt,
            )

        torch.cuda.empty_cache()

        # Run HuggingFace with LoRA (via PEFT)
        with HFRunner(
            BASE_MODEL,
            torch_dtype=torch.float16,
            model_type="generation",
        ) as hf_runner:
            hf_outputs = hf_runner.forward(
                prompts,
                max_new_tokens=MAX_NEW_TOKENS,
                lora_paths=adapter_per_prompt,
            )

        # Compare prefill logprobs
        for idx in range(len(prompts)):
            self._assert_logprobs_close(
                idx,
                "prefill",
                srt_outputs.top_input_logprobs[idx],
                hf_outputs.top_input_logprobs[idx],
            )

        # Compare decode logprobs
        for idx in range(len(prompts)):
            self._assert_logprobs_close(
                idx,
                "decode",
                srt_outputs.top_output_logprobs[idx],
                hf_outputs.top_output_logprobs[idx],
            )
216+
217+
218+
if __name__ == "__main__":
    # Set the multiprocessing start method before any workers are spawned.
    # set_start_method raises RuntimeError if the method was already set
    # (e.g. by the test harness); in that case keep the existing method.
    try:
        mp.set_start_method("spawn")
    except RuntimeError:
        pass

    unittest.main(warnings="ignore")

0 commit comments

Comments
 (0)