Add 2-GPU performance test job for AMD CI

sgl-project · HaiShaw · Jan 23, 2026 · Jan 20, 2026 · Jan 20, 2026 · Jan 20, 2026
commit 74efe0389b3e78ea396e7fb4c57969e405cfb00d
@@ -680,6 +680,46 @@ jobs:
         run: |
           bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1200
 
+  stage-b-test-large-2-gpu-performance-amd:  # runs the run_suite.py suite of the same name with --hw amd, one matrix job per partition
+    needs: [check-changes, call-gate, stage-b-test-small-1-gpu-amd]  # the condition below reads needs.check-changes.outputs
+    if: |
+      always() &&
+      (
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') ||
+        (
+          !inputs.target_stage &&
+          (!failure() && !cancelled()) &&
+          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+        )
+      )
+    strategy:
+      fail-fast: false  # a failing partition does not cancel the remaining matrix jobs
+      matrix:
+        runner: [linux-mi325-gpu-2]
+        part: [0, 1, 2]  # one job per partition; the count must match --auto-partition-size 3 in the run step
+    runs-on: ${{matrix.runner}}
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}  # first non-empty value: PR head SHA, explicit ref, then the triggering commit
+
+      - name: Ensure VRAM is clear
+        run: bash scripts/ensure_vram_clear.sh rocm
+
+      - name: Start CI container
+        run: bash scripts/ci/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}
+
+      - name: Install dependencies
+        run: bash scripts/ci/amd_ci_install_dependency.sh
+
+      - name: Run test
+        timeout-minutes: 30  # cap for the whole step; the command also passes --timeout-per-file 1200
+        run: |
+          bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1200
+
   # performance-test-1-gpu-part-1-amd:
   #   needs: [check-changes, stage-a-test-1-amd]
   #   if: |
@@ -854,74 +894,74 @@ jobs:
   #       run: |
   #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part2.TestBenchServing1GPUPart2.test_embeddings_api_batch_scaling
 
-  performance-test-2-gpu-amd:
-    needs: [check-changes, stage-a-test-1-amd]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'performance-test-2-gpu-amd') ||
-        (
-          !inputs.target_stage &&
-          (!failure() && !cancelled()) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux-mi325-gpu-2]
-    runs-on: ${{matrix.runner}}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+  # performance-test-2-gpu-amd:
+  #   needs: [check-changes, stage-a-test-1-amd]
+  #   if: |
+  #     always() &&
+  #     (
+  #       (inputs.target_stage == 'performance-test-2-gpu-amd') ||
+  #       (
+  #         !inputs.target_stage &&
+  #         (!failure() && !cancelled()) &&
+  #         ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+  #       )
+  #     )
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       runner: [linux-mi325-gpu-2]
+  #   runs-on: ${{matrix.runner}}
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v4
+  #       with:
+  #         ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
 
-      - name: Ensure VRAM is clear
-        run: bash scripts/ensure_vram_clear.sh rocm
+  #     - name: Ensure VRAM is clear
+  #       run: bash scripts/ensure_vram_clear.sh rocm
 
-      - name: Start CI container
-        run: bash scripts/ci/amd_ci_start_container.sh
-        env:
-          GITHUB_WORKSPACE: ${{ github.workspace }}
+  #     - name: Start CI container
+  #       run: bash scripts/ci/amd_ci_start_container.sh
+  #       env:
+  #         GITHUB_WORKSPACE: ${{ github.workspace }}
 
-      - name: Install dependencies
-        run: bash scripts/ci/amd_ci_install_dependency.sh
+  #     - name: Install dependencies
+  #       run: bash scripts/ci/amd_ci_install_dependency.sh
 
-      - name: Benchmark dummy grok (TP=2)
-        timeout-minutes: 30
-        run: |
-          bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
+  #     - name: Benchmark dummy grok (TP=2)
+  #       timeout-minutes: 30
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
 
-      - name: Benchmark single latency (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
+  #     - name: Benchmark single latency (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
 
-      - name: Benchmark single latency + torch.compile (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
+  #     - name: Benchmark single latency + torch.compile (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
 
-      - name: Benchmark offline throughput (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
+  #     - name: Benchmark offline throughput (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
 
-      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
+  #     - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
 
-      - name: Benchmark offline PP decode throughput (PP=2)
-        timeout-minutes: 10
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
+  #     - name: Benchmark offline PP decode throughput (PP=2)
+  #       timeout-minutes: 10
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
 
-      - name: Benchmark offline PP prefill throughput (PP=2)
-        timeout-minutes: 10
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
+  #     - name: Benchmark offline PP prefill throughput (PP=2)
+  #       timeout-minutes: 10
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
 
   accuracy-test-1-gpu-amd:
     needs: [check-changes, stage-a-test-1-amd]

diff --git a/test/registered/perf/test_bench_one_batch_2gpu.py b/test/registered/perf/test_bench_one_batch_2gpu.py
@@ -1,6 +1,6 @@
 import unittest
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
@@ -12,6 +12,7 @@
 )
 
 register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance")
+register_amd_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance-amd")
 
 
 class TestBenchOneBatch2GPU(CustomTestCase):

diff --git a/test/registered/perf/test_bench_serving_2gpu.py b/test/registered/perf/test_bench_serving_2gpu.py
@@ -4,7 +4,7 @@
 
 import unittest
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     CustomTestCase,
@@ -15,6 +15,7 @@
 )
 
 register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance")
+register_amd_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance-amd")
 
 
 class TestBenchServing2GPU(CustomTestCase):

diff --git a/test/run_suite.py b/test/run_suite.py
@@ -24,6 +24,7 @@
         "stage-b-test-small-1-gpu-amd-mi35x",
         "stage-b-test-large-2-gpu-amd",
         "stage-b-test-large-1-gpu-performance-amd",
+        "stage-b-test-large-2-gpu-performance-amd",
         "stage-c-test-large-8-gpu-amd-mi35x",
     ],
     HWBackend.CUDA: [