Skip to content

Commit 00248d8

Browse files
Makcum888e, DHX98, ping1jing2, DHX98, yhyang201
authored
[diffusion] platform: support WAN/FLUX/Qwen-Image/Qwen-Image-edit on Ascend (sgl-project#13662)
Co-authored-by: dhx98 <haox.dai@gmail.com>
Co-authored-by: DHX98 <haoxiand@andrew.cmu.edu>
Co-authored-by: ronnie_zheng <zl19940307@163.com>
Co-authored-by: DHX98 <DHX98@noreply.gitcode.com>
Co-authored-by: Yuhao Yang <47235274+yhyang201@users.noreply.github.com>
1 parent 7b83659 commit 00248d8

File tree

25 files changed

+476
-30
lines changed

25 files changed

+476
-30
lines changed

.github/workflows/pr-test-npu.yml

Lines changed: 40 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,7 @@ jobs:
6464
multimodal_gen:
6565
- "python/sglang/multimodal_gen/**"
6666
- "python/pyproject_npu.toml"
67-
- "scripts/ci/npu_ci_install_dependency.sh"
67+
- "scripts/ci/npu/npu_ci_install_dependency.sh"
6868
- ".github/workflows/pr-test-npu.yml"
6969
7070
# ==================== PR Gate ==================== #
@@ -241,3 +241,42 @@ jobs:
241241
run: |
242242
cd test/srt
243243
python3 run_suite.py --suite per-commit-16-npu-a3 --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 2
244+
245+
multimodal-gen-test-1-npu-a3:
246+
needs: [check-changes, pr-gate]
247+
if: needs.check-changes.outputs.multimodal_gen == 'true'
248+
runs-on: linux-aarch64-a3-16
249+
container:
250+
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc2-a3-ubuntu22.04-py3.11
251+
steps:
252+
- name: Checkout code
253+
uses: actions/checkout@v4
254+
255+
- name: Install dependencies
256+
run: |
257+
# speed up by using infra cache services
258+
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
259+
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
260+
pip config set global.index-url http://${CACHING_URL}/pypi/simple
261+
pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
262+
pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
263+
264+
bash scripts/ci/npu/npu_ci_install_dependency.sh a3
265+
# copy required file from our daily cache
266+
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
267+
# copy download through proxy
268+
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
269+
270+
- name: Run test
271+
timeout-minutes: 60
272+
env:
273+
SGLANG_USE_MODELSCOPE: true
274+
SGLANG_IS_IN_CI: true
275+
HF_ENDPOINT: https://hf-mirror.com
276+
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
277+
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
278+
STREAMS_PER_DEVICE: 32
279+
run: |
280+
export PATH="/usr/local/Ascend/8.3.RC1/compiler/bishengir/bin:${PATH}"
281+
cd python
282+
python3 sglang/multimodal_gen/test/run_suite.py --suite 1-npu

python/pyproject_npu.toml

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -77,7 +77,8 @@ diffusion = [
7777
"moviepy>=2.0.0",
7878
"opencv-python==4.10.0.84",
7979
"remote-pdb",
80-
"cache-dit==1.1.8"
80+
"cache-dit==1.2.1",
81+
"addict"
8182
]
8283

8384
tracing = [

python/sglang/multimodal_gen/runtime/distributed/group_coordinator.py

Lines changed: 1 addition & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@
1616
from torch.cuda import synchronize
1717
from torch.distributed import Backend, ProcessGroup
1818

19-
from sglang.multimodal_gen import envs
2019
from sglang.multimodal_gen.runtime.distributed.device_communicators.base_device_communicator import (
2120
DeviceCommunicatorBase,
2221
)
@@ -46,11 +45,7 @@
4645
def get_local_torch_device() -> torch.device:
4746
"""Return the torch device for the current rank."""
4847

49-
return (
50-
torch.device(f"cuda:{envs.LOCAL_RANK}")
51-
if current_platform.is_cuda_alike()
52-
else torch.device("mps")
53-
)
48+
return current_platform.get_local_torch_device()
5449

5550

5651
def _get_unique_name(name: str) -> str:
@@ -190,8 +185,6 @@ def __init__(
190185
# TODO: fix it for other platforms
191186
self.device = get_local_torch_device()
192187

193-
from sglang.multimodal_gen.runtime.platforms import current_platform
194-
195188
self.use_device_communicator = use_device_communicator
196189

197190
self.device_communicator: DeviceCommunicatorBase = None # type: ignore
@@ -287,9 +280,6 @@ def group_skip_rank(self):
287280

288281
@contextmanager
289282
def graph_capture(self, graph_capture_context: GraphCaptureContext | None = None):
290-
# Platform-aware graph capture
291-
from sglang.multimodal_gen.runtime.platforms import current_platform
292-
293283
if current_platform.is_cuda_alike():
294284
if graph_capture_context is None:
295285
stream = torch.cuda.Stream()

python/sglang/multimodal_gen/runtime/distributed/parallel_state.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -248,7 +248,11 @@ def init_distributed_environment(
248248
# For MPS and MUSA, don't pass device_id as it doesn't support device indices
249249
extra_args = (
250250
{}
251-
if (current_platform.is_mps() or current_platform.is_musa())
251+
if (
252+
current_platform.is_mps()
253+
or current_platform.is_musa()
254+
or current_platform.is_npu()
255+
)
252256
else dict(device_id=device_id)
253257
)
254258

@@ -618,6 +622,7 @@ def maybe_init_distributed_environment_and_model_parallel(
618622
local_rank=local_rank,
619623
distributed_init_method=distributed_init_method,
620624
device_id=device,
625+
backend=current_platform.get_torch_distributed_backend_str(),
621626
timeout=dist_timeout,
622627
)
623628
initialize_model_parallel(

python/sglang/multimodal_gen/runtime/layers/activation.py

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -14,8 +14,12 @@
1414

1515
_is_cuda = current_platform.is_cuda()
1616
_is_hip = current_platform.is_hip()
17+
_is_npu = current_platform.is_npu()
1718
if _is_cuda or _is_hip:
1819
from sgl_kernel import silu_and_mul
20+
21+
if _is_npu:
22+
import torch_npu
1923
# TODO (will): remove this dependency
2024
from sglang.multimodal_gen.runtime.layers.custom_op import CustomOp
2125

@@ -46,6 +50,10 @@ def forward_native(self, x: torch.Tensor) -> torch.Tensor:
4650
d = x.shape[-1] // 2
4751
return F.silu(x[..., :d]) * x[..., d:]
4852

53+
def forward_npu(self, x: torch.Tensor) -> torch.Tensor:
54+
out = torch_npu.npu_swiglu(x)
55+
return out
56+
4957

5058
@CustomOp.register("gelu_and_mul")
5159
class GeluAndMul(CustomOp):

python/sglang/multimodal_gen/runtime/layers/custom_op.py

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ def forward_oot(self, *args, **kwargs) -> Any:
6464
# PyTorch-native implementation.
6565
return self.forward_native(*args, **kwargs)
6666

67+
def forward_npu(self, *args, **kwargs) -> Any:
68+
# By default, we assume that NPU ops are compatible with the
69+
# PyTorch-native implementation.
70+
return self.forward_native(*args, **kwargs)
71+
6772
def dispatch_forward(self) -> Callable:
6873
if _is_cuda:
6974
return self.forward_cuda

python/sglang/multimodal_gen/runtime/layers/layernorm.py

Lines changed: 17 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -12,9 +12,13 @@
1212
from sglang.multimodal_gen.runtime.platforms import current_platform
1313

1414
_is_cuda = current_platform.is_cuda()
15+
_is_npu = current_platform.is_npu()
1516
if _is_cuda:
1617
from sgl_kernel import fused_add_rmsnorm, rmsnorm
1718

19+
if _is_npu:
20+
import torch_npu
21+
1822
from sglang.jit_kernel.norm import can_use_fused_inplace_qknorm, fused_inplace_qknorm
1923
from sglang.multimodal_gen.runtime.distributed.parallel_state import (
2024
get_tensor_model_parallel_rank,
@@ -28,11 +32,8 @@
2832
rms_norm_fn,
2933
triton_one_pass_rms_norm,
3034
)
31-
from sglang.multimodal_gen.runtime.platforms import current_platform
3235
from sglang.multimodal_gen.runtime.utils.common import get_bool_env_var
3336

34-
_is_cuda = current_platform.is_cuda()
35-
3637

3738
# Copied and adapted from sglang
3839
@CustomOp.register("rms_norm")
@@ -141,6 +142,18 @@ def forward_cpu(
141142
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
142143
return self.forward_native(x, residual)
143144

145+
def forward_npu(
146+
self,
147+
x: torch.Tensor,
148+
residual: Optional[torch.Tensor] = None,
149+
) -> Union[torch.Tensor, Tuple[torch.Tensor, torch.Tensor]]:
150+
if residual is not None:
151+
out, _, residual_out = torch_npu.npu_add_rms_norm(
152+
residual, x, self.weight.data, self.variance_epsilon
153+
)
154+
return out, residual_out
155+
return torch_npu.npu_rms_norm(x, self.weight.data, self.variance_epsilon)[0]
156+
144157
def forward_hip(
145158
self,
146159
x: torch.Tensor,
@@ -214,7 +227,7 @@ def forward_cuda(
214227
x = x.view(-1, self.hidden_size)
215228
return self.forward_triton(x).view(shape)
216229

217-
@torch.compile(backend="inductor")
230+
@torch.compile(backend="inductor", disable=current_platform.is_npu())
218231
def forward_native(
219232
self,
220233
x: torch.Tensor,

python/sglang/multimodal_gen/runtime/layers/linear.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -35,6 +35,7 @@
3535

3636
# yapf: enable
3737
from sglang.multimodal_gen.runtime.models.utils import set_weight_attrs
38+
from sglang.multimodal_gen.runtime.platforms import current_platform
3839
from sglang.multimodal_gen.runtime.utils.logging_utils import init_logger
3940

4041
logger = init_logger(__name__)
@@ -152,7 +153,7 @@ def apply(
152153
) -> torch.Tensor:
153154
output = (
154155
F.linear(x, layer.weight, bias)
155-
if torch.cuda.is_available() or bias is None
156+
if current_platform.is_amp_supported() or bias is None
156157
else F.linear(x, layer.weight, bias.to(x.dtype))
157158
) # NOTE: this line assumes that we are using amp when using cuda and is needed to account for the fact that amp isn't supported in mps
158159
return output

python/sglang/multimodal_gen/runtime/layers/triton_ops.py

Lines changed: 38 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,8 @@
88
import triton.language as tl # type: ignore
99
from torch import Tensor
1010

11+
from sglang.multimodal_gen.runtime.platforms import current_platform
12+
1113

1214
@triton.autotune(
1315
configs=[
@@ -524,8 +526,14 @@ def triton_autotune_configs():
524526
max_threads_per_block = 1024
525527
# Default to warp size 32 if not defined by device
526528
warp_size = getattr(
527-
torch.cuda.get_device_properties(torch.cuda.current_device()), "warp_size", 32
529+
torch.get_device_module().get_device_properties(
530+
torch.get_device_module().current_device()
531+
),
532+
"warp_size",
533+
32,
528534
)
535+
if warp_size is None:
536+
warp_size = 32
529537
# Autotune for warp counts which are powers of 2 and do not exceed thread per block limit
530538
return [
531539
triton.Config({}, num_warps=warp_count)
@@ -820,7 +828,7 @@ def _layer_norm_fwd_impl(
820828
BLOCK_N = min(MAX_FUSED_SIZE, triton.next_power_of_2(N))
821829
if N > BLOCK_N:
822830
raise RuntimeError("This layer norm doesn't support feature dim >= 64KB.")
823-
with torch.cuda.device(x.device.index):
831+
with torch.get_device_module().device(x.device.index):
824832
torch.library.wrap_triton(_layer_norm_fwd_1pass_kernel)[(M,)](
825833
x,
826834
out,
@@ -1166,3 +1174,31 @@ def triton_one_pass_rms_norm(x: torch.Tensor, w: torch.Tensor, eps: float = 1e-6
11661174
BLOCK_SIZE_SEQ=BLOCK_SIZE_SEQ,
11671175
)
11681176
return y
1177+
1178+
1179+
if current_platform.is_npu():
1180+
# TODO: remove this when triton ascend bug is fixed
1181+
def fuse_scale_shift_native(
1182+
x: torch.Tensor,
1183+
scale: torch.Tensor,
1184+
shift: torch.Tensor,
1185+
block_l: int = 128,
1186+
block_c: int = 128,
1187+
):
1188+
return x * (1 + scale) + shift
1189+
1190+
fuse_scale_shift_kernel = fuse_scale_shift_native
1191+
1192+
# TODO: remove this when triton ascend bug is fixed
1193+
def apply_rotary_embedding_native(
1194+
x: torch.Tensor, cos: torch.Tensor, sin: torch.Tensor, interleaved: bool = False
1195+
) -> torch.Tensor:
1196+
cos = cos.unsqueeze(-2).to(x.dtype)
1197+
sin = sin.unsqueeze(-2).to(x.dtype)
1198+
x1 = x[..., ::2]
1199+
x2 = x[..., 1::2]
1200+
o1 = x1 * cos - x2 * sin
1201+
o2 = x2 * cos + x1 * sin
1202+
return torch.stack((o1, o2), dim=-1).flatten(-2)
1203+
1204+
apply_rotary_embedding = apply_rotary_embedding_native

python/sglang/multimodal_gen/runtime/layers/vocab_parallel_embedding.py

Lines changed: 5 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -145,7 +145,11 @@ def __post_init__(self):
145145
assert self.num_added_elements <= self.num_added_elements_padded
146146

147147

148-
@torch.compile(dynamic=True, backend=current_platform.simple_compile_backend)
148+
@torch.compile(
149+
dynamic=True,
150+
backend=current_platform.simple_compile_backend,
151+
disable=current_platform.is_npu(),
152+
)
149153
def get_masked_input_and_mask(
150154
input_: torch.Tensor,
151155
org_vocab_start_index: int,

0 commit comments

Comments (0)