[NPU] Support model Skywork-Reward-Gemma-2-27B-v0.2 (sgl-project#16947)

McZyWu · chenyang08056032 · web-flow · commit 4f7422f7bada · 2026-02-11T15:34:53.000+08:00
Co-authored-by: cy <chenyang08056032@163.com>
diff --git a/python/sglang/srt/configs/model_config.py b/python/sglang/srt/configs/model_config.py
@@ -1163,6 +1163,7 @@ def is_generation_model(model_architectures: List[str], is_embedding: bool = Fal
         or "BertForSequenceClassification" in model_architectures
         or "XLMRobertaModel" in model_architectures
         or "XLMRobertaForSequenceClassification" in model_architectures
+        or "Gemma2ForSequenceClassification" in model_architectures
     ):
         return False
     else:
diff --git a/python/sglang/srt/environ.py b/python/sglang/srt/environ.py
@@ -294,6 +294,10 @@ class Envs:
     SGLANG_NPU_DISABLE_ACL_FORMAT_WEIGHT = EnvBool(False)
     SGLANG_NPU_USE_MULTI_STREAM = EnvBool(False)
     SGLANG_NPU_USE_MLAPO = EnvBool(False)
+    # Use the native forward implementation of the GELU-tanh activation on NPU (for model Skywork-Reward-Gemma-2-27B-v0.2)
+    SGLANG_NPU_FORWARD_NATIVE_GELUTANH = EnvBool(False)
+    # Use the native forward implementation of Gemma RMS norm on NPU (for model Skywork-Reward-Gemma-2-27B-v0.2)
+    SGLANG_NPU_FORWARD_NATIVE_GEMMA_RMS_NORM = EnvBool(False)
 
     # Quantization
     SGLANG_INT4_WEIGHT = EnvBool(False)
diff --git a/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py b/python/sglang/srt/hardware_backend/npu/attention/ascend_backend.py
@@ -225,6 +225,11 @@ def __init__(self, model_runner: ModelRunner):
             self.q_head_dim = self.qk_rope_head_dim + self.qk_nope_head_dim
         else:
             self.use_alibi = getattr(model_runner.model_config, "use_alibi", False)
+            if (
+                "Gemma2ForSequenceClassification"
+                in model_runner.model_config.hf_config.architectures
+            ):
+                self.use_native_sdpa = True
         self.native_attn = AscendTorchNativeAttnBackend()
         self.graph_metadata = {}
         self.max_context_len = model_runner.model_config.context_len
@@ -821,10 +826,12 @@ def forward_extend(
 
                 # there are some accuracy issues in cross attention scene to use torch_npu._npu_flash_attention_qlens
                 # forward_batch.encoder_lens is not None in cross attention scend, we add native attn to solve accuracy issues
+                # Model Skywork-Reward-Gemma-2-27B also suffers from precision anomalies, so the torch native attention backend is used for it as well.
                 if (
                     layer.qk_head_dim <= 128
                     and causal
                     and forward_batch.encoder_lens is None
+                    and not getattr(self, "use_native_sdpa", False)
                 ):
                     if not self.use_alibi:
                         query = q.reshape(-1, layer.tp_q_head_num * layer.qk_head_dim)
diff --git a/python/sglang/srt/layers/activation.py b/python/sglang/srt/layers/activation.py
@@ -27,6 +27,7 @@
     get_tensor_model_parallel_rank,
     get_tensor_model_parallel_world_size,
 )
+from sglang.srt.environ import envs
 from sglang.srt.layers.quantization.base_config import QuantizationConfig
 from sglang.srt.layers.utils import MultiPlatformOp
 from sglang.srt.server_args import get_global_server_args
@@ -131,6 +132,8 @@ def forward_xpu(self, x: torch.Tensor) -> torch.Tensor:
         return self._forward_impl(x)
 
     def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
+        if envs.SGLANG_NPU_FORWARD_NATIVE_GELUTANH.get():
+            return self.forward_native(x)
         y_npu, gelu_npu = torch_npu.npu_geglu(
             x,
             dim=-1,
diff --git a/python/sglang/srt/layers/layernorm.py b/python/sglang/srt/layers/layernorm.py
@@ -24,6 +24,7 @@
     is_batch_invariant_mode_enabled,
     rms_norm_batch_invariant,
 )
+from sglang.srt.environ import envs
 from sglang.srt.layers.utils import MultiPlatformOp
 from sglang.srt.server_args import get_global_server_args
 from sglang.srt.utils import (
@@ -468,6 +469,8 @@ def forward_npu(
         residual: Optional[torch.Tensor] = None,
         post_residual_addition: Optional[torch.Tensor] = None,
     ) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
+        if envs.SGLANG_NPU_FORWARD_NATIVE_GEMMA_RMS_NORM.get():
+            return self.forward_native(x, residual)
         if residual is not None:
             if post_residual_addition is not None:
                 residual = residual + post_residual_addition
diff --git a/python/sglang/srt/models/gemma2.py b/python/sglang/srt/models/gemma2.py
@@ -39,7 +39,9 @@
     default_weight_loader,
     maybe_remap_kv_scale_name,
 )
-from sglang.srt.utils import add_prefix, make_layers
+from sglang.srt.utils import add_prefix, is_npu, make_layers
+
+_is_npu = is_npu()
 
 
 # Aligned with HF's implementation, using sliding window inclusive with the last token
@@ -142,13 +144,28 @@ def __init__(
             quant_config=quant_config,
             prefix=add_prefix("o_proj", prefix),
         )
-        self.rotary_emb = get_rope(
-            self.head_dim,
-            rotary_dim=self.head_dim,
-            max_position=max_position_embeddings,
-            base=self.rope_theta,
-            is_neox_style=True,
-        )
+        if (
+            not _is_npu
+            or "Gemma2ForSequenceClassification" not in self.config.architectures
+        ):
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=self.rope_theta,
+                is_neox_style=True,
+            )
+            logit_cap = self.config.attn_logit_softcapping
+        else:
+            self.rotary_emb = get_rope(
+                self.head_dim,
+                rotary_dim=self.head_dim,
+                max_position=max_position_embeddings,
+                base=self.rope_theta,
+                is_neox_style=True,
+                dtype=torch.float32,
+            )
+            logit_cap = 0.0
 
         use_sliding_window = layer_id % 2 == 0 and hasattr(config, "sliding_window")
         self.attn = RadixAttention(
@@ -157,7 +174,7 @@ def __init__(
             self.scaling,
             num_kv_heads=self.num_kv_heads,
             layer_id=layer_id,
-            logit_cap=self.config.attn_logit_softcapping,
+            logit_cap=logit_cap,
             sliding_window_size=(
                 get_attention_sliding_window_size(config)
                 if use_sliding_window
@@ -294,7 +311,9 @@ def forward(
             hidden_states = self.embed_tokens(input_ids)
         else:
             hidden_states = input_embeds
-        normalizer = torch.tensor(self.config.hidden_size**0.5, dtype=torch.float16)
+        normalizer = torch.tensor(
+            self.config.hidden_size**0.5, dtype=hidden_states.dtype
+        )
         hidden_states *= normalizer
 
         residual = None
diff --git a/test/registered/ascend/reward_models/test_ascend_gemma_2_27b_v0_2.py b/test/registered/ascend/reward_models/test_ascend_gemma_2_27b_v0_2.py
@@ -0,0 +1,88 @@
+import logging
+import multiprocessing as mp
+import os
+import unittest
+
+import torch
+
+from sglang.test.ci.ci_register import register_npu_ci
+from sglang.test.runners import HFRunner, SRTRunner
+from sglang.test.test_utils import CustomTestCase
+
+logger = logging.getLogger(__name__)
+register_npu_ci(est_time=400, suite="nightly-1-npu-a3", nightly=True)
+
+MODELS = [
+    (
+        "/root/.cache/modelscope/hub/models/AI-ModelScope/Skywork-Reward-Gemma-2-27B-v0.2",
+        1,
+        4e-2,
+    ),
+]
+TORCH_DTYPES = [torch.bfloat16]
+
+PROMPT = (
+    "What is the range of the numeric output of a sigmoid node in a neural network?"
+)
+RESPONSE1 = "The output of a sigmoid node is bounded between -1 and 1."
+RESPONSE2 = "The output of a sigmoid node is bounded between 0 and 1."
+
+CONVS = [
+    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE1}],
+    [{"role": "user", "content": PROMPT}, {"role": "assistant", "content": RESPONSE2}],
+]
+
+
+class TestRewardModels(CustomTestCase):
+
+    @classmethod
+    def setUpClass(cls):
+        mp.set_start_method("spawn", force=True)
+
+    def assert_close_reward_scores(
+        self,
+        convs,
+        model_path,
+        tp_size,
+        torch_dtype,
+        tolerance,
+    ) -> None:
+        with HFRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="reward",
+        ) as hf_runner:
+            hf_outputs = hf_runner.forward(convs)
+
+        with SRTRunner(
+            model_path,
+            torch_dtype=torch_dtype,
+            model_type="reward",
+            mem_fraction_static=0.95,
+        ) as srt_runner:
+            prompts = srt_runner.tokenizer.apply_chat_template(
+                convs, tokenize=False, return_dict=False
+            )
+            srt_outputs = srt_runner.forward(prompts)
+
+        hf_scores = torch.tensor(hf_outputs.scores)
+        srt_scores = torch.tensor(srt_outputs.scores)
+        logger.info(f"{hf_scores=}")
+        logger.info(f"{srt_scores=}")
+
+        assert torch.all(
+            abs(hf_scores - srt_scores) < tolerance
+        ), "reward scores are not all close"
+
+    def test_reward_scores(self):
+        for model, tp_size, tolerance in MODELS:
+            for torch_dtype in TORCH_DTYPES:
+                self.assert_close_reward_scores(
+                    CONVS, model, tp_size, torch_dtype, tolerance
+                )
+
+
+if __name__ == "__main__":
+    os.environ["SGLANG_NPU_FORWARD_NATIVE_GELUTANH"] = "1"
+    os.environ["SGLANG_NPU_FORWARD_NATIVE_GEMMA_RMS_NORM"] = "1"
+    unittest.main()