Skip to content
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
add perf 2 gpu job
  • Loading branch information
yctseng0211 committed Jan 20, 2026
commit 74efe0389b3e78ea396e7fb4c57969e405cfb00d
156 changes: 98 additions & 58 deletions .github/workflows/pr-test-amd.yml
Original file line number Diff line number Diff line change
Expand Up @@ -680,6 +680,46 @@ jobs:
run: |
bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1200

# Stage B: 2-GPU performance suite on AMD runners.
# The suite is sharded: one matrix job per `part`, and the number of parts
# matches the `--auto-partition-size 3` passed to run_suite.py below.
stage-b-test-large-2-gpu-performance-amd:
  needs:
    - check-changes
    - call-gate
    - stage-b-test-small-1-gpu-amd
  # Runs when this stage is requested explicitly through inputs.target_stage.
  # Otherwise runs only when no target stage is given, nothing upstream failed
  # or was cancelled, and check-changes reports main_package or sgl_kernel
  # as 'true'.
  if: |
    always() &&
    (
      (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') ||
      (
        !inputs.target_stage &&
        (!failure() && !cancelled()) &&
        ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
      )
    )
  strategy:
    # Let the remaining shards finish even if one of them fails.
    fail-fast: false
    matrix:
      runner:
        - linux-mi325-gpu-2
      part:
        - 0
        - 1
        - 2
  runs-on: ${{matrix.runner}}
  steps:
    - name: Checkout code
      uses: actions/checkout@v4
      with:
        # First non-empty value of: PR head SHA input, ref input, triggering commit.
        ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

    - name: Ensure VRAM is clear
      run: bash scripts/ensure_vram_clear.sh rocm

    - name: Start CI container
      run: bash scripts/ci/amd_ci_start_container.sh
      env:
        GITHUB_WORKSPACE: ${{ github.workspace }}

    - name: Install dependencies
      run: bash scripts/ci/amd_ci_install_dependency.sh

    - name: Run test
      timeout-minutes: 30
      # Each shard passes its own matrix.part as the partition id.
      run: |
        bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 1200

# performance-test-1-gpu-part-1-amd:
# needs: [check-changes, stage-a-test-1-amd]
# if: |
Expand Down Expand Up @@ -854,74 +894,74 @@ jobs:
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part2.TestBenchServing1GPUPart2.test_embeddings_api_batch_scaling

performance-test-2-gpu-amd:
needs: [check-changes, stage-a-test-1-amd]
if: |
always() &&
(
(inputs.target_stage == 'performance-test-2-gpu-amd') ||
(
!inputs.target_stage &&
(!failure() && !cancelled()) &&
((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
)
)
strategy:
fail-fast: false
matrix:
runner: [linux-mi325-gpu-2]
runs-on: ${{matrix.runner}}
steps:
- name: Checkout code
uses: actions/checkout@v4
with:
ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
# performance-test-2-gpu-amd:
# needs: [check-changes, stage-a-test-1-amd]
# if: |
# always() &&
# (
# (inputs.target_stage == 'performance-test-2-gpu-amd') ||
# (
# !inputs.target_stage &&
# (!failure() && !cancelled()) &&
# ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
# )
# )
# strategy:
# fail-fast: false
# matrix:
# runner: [linux-mi325-gpu-2]
# runs-on: ${{matrix.runner}}
# steps:
# - name: Checkout code
# uses: actions/checkout@v4
# with:
# ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}

- name: Ensure VRAM is clear
run: bash scripts/ensure_vram_clear.sh rocm
# - name: Ensure VRAM is clear
# run: bash scripts/ensure_vram_clear.sh rocm

- name: Start CI container
run: bash scripts/ci/amd_ci_start_container.sh
env:
GITHUB_WORKSPACE: ${{ github.workspace }}
# - name: Start CI container
# run: bash scripts/ci/amd_ci_start_container.sh
# env:
# GITHUB_WORKSPACE: ${{ github.workspace }}

- name: Install dependencies
run: bash scripts/ci/amd_ci_install_dependency.sh
# - name: Install dependencies
# run: bash scripts/ci/amd_ci_install_dependency.sh

- name: Benchmark dummy grok (TP=2)
timeout-minutes: 30
run: |
bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
# - name: Benchmark dummy grok (TP=2)
# timeout-minutes: 30
# run: |
# bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py

- name: Benchmark single latency (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
# - name: Benchmark single latency (TP=2)
# timeout-minutes: 25
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1

- name: Benchmark single latency + torch.compile (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
# - name: Benchmark single latency + torch.compile (TP=2)
# timeout-minutes: 25
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1

- name: Benchmark offline throughput (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
# - name: Benchmark offline throughput (TP=2)
# timeout-minutes: 25
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default

- name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
timeout-minutes: 25
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
# - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
# timeout-minutes: 25
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache

- name: Benchmark offline PP decode throughput (PP=2)
timeout-minutes: 10
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
# - name: Benchmark offline PP decode throughput (PP=2)
# timeout-minutes: 10
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode

- name: Benchmark offline PP prefill throughput (PP=2)
timeout-minutes: 10
run: |
bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
# - name: Benchmark offline PP prefill throughput (PP=2)
# timeout-minutes: 10
# run: |
# bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill

accuracy-test-1-gpu-amd:
needs: [check-changes, stage-a-test-1-amd]
Expand Down
3 changes: 2 additions & 1 deletion test/registered/perf/test_bench_one_batch_2gpu.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
import unittest

from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
from sglang.test.test_utils import (
DEFAULT_MODEL_NAME_FOR_TEST,
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
Expand All @@ -12,6 +12,7 @@
)

register_cuda_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance")
register_amd_ci(est_time=180, suite="stage-b-test-large-2-gpu-performance-amd")


class TestBenchOneBatch2GPU(CustomTestCase):
Expand Down
3 changes: 2 additions & 1 deletion test/registered/perf/test_bench_serving_2gpu.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@

import unittest

from sglang.test.ci.ci_register import register_cuda_ci
from sglang.test.ci.ci_register import register_amd_ci, register_cuda_ci
from sglang.test.test_utils import (
DEFAULT_MOE_MODEL_NAME_FOR_TEST,
CustomTestCase,
Expand All @@ -15,6 +15,7 @@
)

register_cuda_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance")
register_amd_ci(est_time=600, suite="stage-b-test-large-2-gpu-performance-amd")


class TestBenchServing2GPU(CustomTestCase):
Expand Down
1 change: 1 addition & 0 deletions test/run_suite.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
"stage-b-test-small-1-gpu-amd-mi35x",
"stage-b-test-large-2-gpu-amd",
"stage-b-test-large-1-gpu-performance-amd",
"stage-b-test-large-2-gpu-performance-amd",
"stage-c-test-large-8-gpu-amd-mi35x",
],
HWBackend.CUDA: [
Expand Down
Loading