Create stage-b AMD performance jobs and migrate perf tests to run_suite.py

yctseng0211 · yctseng0211 · commit 2024c0b8cfc4 · 2026-01-19T03:09:56.000-06:00
diff --git a/.github/workflows/pr-test-amd.yml b/.github/workflows/pr-test-amd.yml
@@ -676,67 +676,12 @@ jobs:
         run: |
           bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
 
-  performance-test-1-gpu-part-1-amd:
+  stage-b-test-large-1-gpu-performance-amd:
     needs: [check-changes, stage-a-test-1-amd]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-1-gpu-part-1-amd') ||
-        (
-          !inputs.target_stage &&
-          (!failure() && !cancelled()) &&
-          ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
-        )
-      )
-    strategy:
-      fail-fast: false
-      matrix:
-        runner: [linux-mi325-gpu-1]
-    runs-on: ${{matrix.runner}}
-    steps:
-      - name: Checkout code
-        uses: actions/checkout@v4
-        with:
-          ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
-
-      - name: Ensure VRAM is clear
-        run: bash scripts/ensure_vram_clear.sh rocm
-
-      - name: Start CI container
-        run: bash scripts/ci/amd_ci_start_container.sh
-        env:
-          GITHUB_WORKSPACE: ${{ github.workspace }}
-
-      - name: Install dependencies
-        run: bash scripts/ci/amd_ci_install_dependency.sh
-
-      - name: Benchmark single latency
-        timeout-minutes: 20
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default
-
-      - name: Benchmark online latency
-        timeout-minutes: 15
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default
-
-      - name: Benchmark offline throughput
-        timeout-minutes: 15
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default
-
-      - name: Benchmark offline throughput (Non-streaming, small batch size)
-        timeout-minutes: 15
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size
-
-  performance-test-1-gpu-part-2-amd:
-    needs: [check-changes, stage-a-test-1-amd]
-    if: |
-      always() &&
-      (
-        (inputs.target_stage == 'performance-test-1-gpu-part-2-amd') ||
+        (inputs.target_stage == 'stage-b-test-large-1-gpu-performance-amd') ||
         (
           !inputs.target_stage &&
           (!failure() && !cancelled()) &&
@@ -747,6 +692,7 @@ jobs:
       fail-fast: false
       matrix:
         runner: [linux-mi325-gpu-1]
+        part: [0, 1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -765,27 +711,17 @@ jobs:
       - name: Install dependencies
         run: bash scripts/ci/amd_ci_install_dependency.sh
 
-      - name: Benchmark offline throughput (w/o RadixAttention)
-        timeout-minutes: 15
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache
-
-      - name: Benchmark offline throughput (w/ Triton)
-        timeout-minutes: 15
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend
-
-      - name: Benchmark offline throughput (w/ FP8)
-        timeout-minutes: 15
+      - name: Run test
+        timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8
+          bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
 
-  performance-test-2-gpu-amd:
+  stage-b-test-large-2-gpu-performance-amd:
     needs: [check-changes, stage-a-test-1-amd]
     if: |
       always() &&
       (
-        (inputs.target_stage == 'performance-test-2-gpu-amd') ||
+        (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') ||
         (
           !inputs.target_stage &&
           (!failure() && !cancelled()) &&
@@ -796,6 +732,7 @@ jobs:
       fail-fast: false
       matrix:
         runner: [linux-mi325-gpu-2]
+        part: [0, 1]
     runs-on: ${{matrix.runner}}
     steps:
       - name: Checkout code
@@ -814,40 +751,183 @@ jobs:
       - name: Install dependencies
         run: bash scripts/ci/amd_ci_install_dependency.sh
 
-      - name: Benchmark dummy grok (TP=2)
+      - name: Run test
         timeout-minutes: 30
         run: |
-          bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
-
-      - name: Benchmark single latency (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
-
-      - name: Benchmark single latency + torch.compile (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
-
-      - name: Benchmark offline throughput (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
-
-      - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
-        timeout-minutes: 25
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
-
-      - name: Benchmark offline PP decode throughput (PP=2)
-        timeout-minutes: 10
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
-
-      - name: Benchmark offline PP prefill throughput (PP=2)
-        timeout-minutes: 10
-        run: |
-          bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
+          bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
+
+  # performance-test-1-gpu-part-1-amd:
+  #   needs: [check-changes, stage-a-test-1-amd]
+  #   if: |
+  #     always() &&
+  #     (
+  #       (inputs.target_stage == 'performance-test-1-gpu-part-1-amd') ||
+  #       (
+  #         !inputs.target_stage &&
+  #         (!failure() && !cancelled()) &&
+  #         ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+  #       )
+  #     )
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       runner: [linux-mi325-gpu-1]
+  #   runs-on: ${{matrix.runner}}
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v4
+  #       with:
+  #         ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+  #     - name: Ensure VRAM is clear
+  #       run: bash scripts/ensure_vram_clear.sh rocm
+
+  #     - name: Start CI container
+  #       run: bash scripts/ci/amd_ci_start_container.sh
+  #       env:
+  #         GITHUB_WORKSPACE: ${{ github.workspace }}
+
+  #     - name: Install dependencies
+  #       run: bash scripts/ci/amd_ci_install_dependency.sh
+
+  #     - name: Benchmark single latency
+  #       timeout-minutes: 20
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default
+
+  #     - name: Benchmark online latency
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default
+
+  #     - name: Benchmark offline throughput
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default
+
+  #     - name: Benchmark offline throughput (Non-streaming, small batch size)
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size
+
+  # performance-test-1-gpu-part-2-amd:
+  #   needs: [check-changes, stage-a-test-1-amd]
+  #   if: |
+  #     always() &&
+  #     (
+  #       (inputs.target_stage == 'performance-test-1-gpu-part-2-amd') ||
+  #       (
+  #         !inputs.target_stage &&
+  #         (!failure() && !cancelled()) &&
+  #         ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+  #       )
+  #     )
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       runner: [linux-mi325-gpu-1]
+  #   runs-on: ${{matrix.runner}}
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v4
+  #       with:
+  #         ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+  #     - name: Ensure VRAM is clear
+  #       run: bash scripts/ensure_vram_clear.sh rocm
+
+  #     - name: Start CI container
+  #       run: bash scripts/ci/amd_ci_start_container.sh
+  #       env:
+  #         GITHUB_WORKSPACE: ${{ github.workspace }}
+
+  #     - name: Install dependencies
+  #       run: bash scripts/ci/amd_ci_install_dependency.sh
+
+  #     - name: Benchmark offline throughput (w/o RadixAttention)
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache
+
+  #     - name: Benchmark offline throughput (w/ Triton)
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend
+
+  #     - name: Benchmark offline throughput (w/ FP8)
+  #       timeout-minutes: 15
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8
+
+  # performance-test-2-gpu-amd:
+  #   needs: [check-changes, stage-a-test-1-amd]
+  #   if: |
+  #     always() &&
+  #     (
+  #       (inputs.target_stage == 'performance-test-2-gpu-amd') ||
+  #       (
+  #         !inputs.target_stage &&
+  #         (!failure() && !cancelled()) &&
+  #         ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
+  #       )
+  #     )
+  #   strategy:
+  #     fail-fast: false
+  #     matrix:
+  #       runner: [linux-mi325-gpu-2]
+  #   runs-on: ${{matrix.runner}}
+  #   steps:
+  #     - name: Checkout code
+  #       uses: actions/checkout@v4
+  #       with:
+  #         ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
+
+  #     - name: Ensure VRAM is clear
+  #       run: bash scripts/ensure_vram_clear.sh rocm
+
+  #     - name: Start CI container
+  #       run: bash scripts/ci/amd_ci_start_container.sh
+  #       env:
+  #         GITHUB_WORKSPACE: ${{ github.workspace }}
+
+  #     - name: Install dependencies
+  #       run: bash scripts/ci/amd_ci_install_dependency.sh
+
+  #     - name: Benchmark dummy grok (TP=2)
+  #       timeout-minutes: 30
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
+
+  #     - name: Benchmark single latency (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
+
+  #     - name: Benchmark single latency + torch.compile (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
+
+  #     - name: Benchmark offline throughput (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
+
+  #     - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
+  #       timeout-minutes: 25
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
+
+  #     - name: Benchmark offline PP decode throughput (PP=2)
+  #       timeout-minutes: 10
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
+
+  #     - name: Benchmark offline PP prefill throughput (PP=2)
+  #       timeout-minutes: 10
+  #       run: |
+  #         bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
 
   accuracy-test-1-gpu-amd:
     needs: [check-changes, stage-a-test-1-amd]
diff --git a/test/registered/perf/test_bench_serving_1gpu_large.py b/test/registered/perf/test_bench_serving_1gpu_large.py
@@ -4,7 +4,7 @@
 
 import unittest
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_DRAFT_MODEL_EAGLE,
     DEFAULT_MODEL_NAME_FOR_TEST_FP8,
@@ -17,6 +17,7 @@
 )
 
 register_cuda_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance")
+register_amd_ci(est_time=300, suite="stage-b-test-large-1-gpu-performance-amd")
 
 
 class TestBenchServing1GPULarge(CustomTestCase):
diff --git a/test/registered/perf/test_bench_serving_1gpu_part1.py b/test/registered/perf/test_bench_serving_1gpu_part1.py
@@ -9,7 +9,7 @@
 
 import requests
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_MODEL_NAME_FOR_TEST,
     CustomTestCase,
@@ -20,6 +20,7 @@
 )
 
 register_cuda_ci(est_time=1000, suite="stage-b-test-large-1-gpu-performance")
+register_amd_ci(est_time=1000, suite="stage-b-test-large-1-gpu-performance-amd")
 
 
 class TestBenchServing1GPUPart1(CustomTestCase):
diff --git a/test/registered/perf/test_bench_serving_2gpu.py b/test/registered/perf/test_bench_serving_2gpu.py
@@ -4,7 +4,7 @@
 
 import unittest
 
-from sglang.test.ci.ci_register import register_cuda_ci
+from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
 from sglang.test.test_utils import (
     DEFAULT_MOE_MODEL_NAME_FOR_TEST,
     CustomTestCase,
@@ -15,6 +15,7 @@
 )
 
 register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance")
+register_amd_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance-amd")
 
 
 class TestBenchServing2GPU(CustomTestCase):
diff --git a/test/run_suite.py b/test/run_suite.py
@@ -23,6 +23,8 @@
         "stage-b-test-small-1-gpu-amd",
         "stage-b-test-small-1-gpu-amd-mi35x",
         "stage-b-test-large-2-gpu-amd",
+        "stage-b-test-large-1-gpu-performance-amd",
+        "stage-b-test-large-2-gpu-performance-amd",
         "stage-c-test-large-8-gpu-amd-mi35x",
     ],
     HWBackend.CUDA: [