@@ -676,67 +676,12 @@ jobs:
676676 run : |
677677 bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-c-test-large-8-gpu-amd-mi35x --auto-partition-id ${{ matrix.part }} --auto-partition-size 3 --timeout-per-file 3600
678678
679- performance-test-1-gpu-part-1-amd :
679+ stage-b-test-large-1-gpu-performance-amd :
680680 needs : [check-changes, stage-a-test-1-amd]
681681 if : |
682682 always() &&
683683 (
684- (inputs.target_stage == 'performance-test-1-gpu-part-1-amd') ||
685- (
686- !inputs.target_stage &&
687- (!failure() && !cancelled()) &&
688- ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
689- )
690- )
691- strategy :
692- fail-fast : false
693- matrix :
694- runner : [linux-mi325-gpu-1]
695- runs-on : ${{matrix.runner}}
696- steps :
697- - name : Checkout code
698- uses : actions/checkout@v4
699- with :
700- ref : ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
701-
702- - name : Ensure VRAM is clear
703- run : bash scripts/ensure_vram_clear.sh rocm
704-
705- - name : Start CI container
706- run : bash scripts/ci/amd_ci_start_container.sh
707- env :
708- GITHUB_WORKSPACE : ${{ github.workspace }}
709-
710- - name : Install dependencies
711- run : bash scripts/ci/amd_ci_install_dependency.sh
712-
713- - name : Benchmark single latency
714- timeout-minutes : 20
715- run : |
716- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small
717- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default
718-
719- - name : Benchmark online latency
720- timeout-minutes : 15
721- run : |
722- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default
723-
724- - name : Benchmark offline throughput
725- timeout-minutes : 15
726- run : |
727- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default
728-
729- - name : Benchmark offline throughput (Non-streaming, small batch size)
730- timeout-minutes : 15
731- run : |
732- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size
733-
734- performance-test-1-gpu-part-2-amd :
735- needs : [check-changes, stage-a-test-1-amd]
736- if : |
737- always() &&
738- (
739- (inputs.target_stage == 'performance-test-1-gpu-part-2-amd') ||
684+ (inputs.target_stage == 'stage-b-test-large-1-gpu-performance-amd') ||
740685 (
741686 !inputs.target_stage &&
742687 (!failure() && !cancelled()) &&
@@ -747,6 +692,7 @@ jobs:
747692 fail-fast : false
748693 matrix :
749694 runner : [linux-mi325-gpu-1]
695+ part : [0, 1]
750696 runs-on : ${{matrix.runner}}
751697 steps :
752698 - name : Checkout code
@@ -765,27 +711,17 @@ jobs:
765711 - name : Install dependencies
766712 run : bash scripts/ci/amd_ci_install_dependency.sh
767713
768- - name : Benchmark offline throughput (w/o RadixAttention)
769- timeout-minutes : 15
770- run : |
771- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache
772-
773- - name : Benchmark offline throughput (w/ Triton)
774- timeout-minutes : 15
775- run : |
776- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend
777-
778- - name : Benchmark offline throughput (w/ FP8)
779- timeout-minutes : 15
714+ - name : Run test
715+ timeout-minutes : 30
780716 run : |
781- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8
717+ bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-1-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
782718
783- performance-test-2-gpu-amd :
719+ stage-b-test-large-2-gpu-performance-amd :
784720 needs : [check-changes, stage-a-test-1-amd]
785721 if : |
786722 always() &&
787723 (
788- (inputs.target_stage == 'performance-test-2-gpu-amd') ||
724+ (inputs.target_stage == 'stage-b-test-large-2-gpu-performance-amd') ||
789725 (
790726 !inputs.target_stage &&
791727 (!failure() && !cancelled()) &&
@@ -796,6 +732,7 @@ jobs:
796732 fail-fast : false
797733 matrix :
798734 runner : [linux-mi325-gpu-2]
735+ part : [0, 1]
799736 runs-on : ${{matrix.runner}}
800737 steps :
801738 - name : Checkout code
@@ -814,40 +751,183 @@ jobs:
814751 - name : Install dependencies
815752 run : bash scripts/ci/amd_ci_install_dependency.sh
816753
817- - name : Benchmark dummy grok (TP=2)
754+ - name : Run test
818755 timeout-minutes : 30
819756 run : |
820- bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
821-
822- - name : Benchmark single latency (TP=2)
823- timeout-minutes : 25
824- run : |
825- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
826-
827- - name : Benchmark single latency + torch.compile (TP=2)
828- timeout-minutes : 25
829- run : |
830- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
831-
832- - name : Benchmark offline throughput (TP=2)
833- timeout-minutes : 25
834- run : |
835- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
836-
837- - name : Benchmark offline throughput (w/o RadixAttention) (TP=2)
838- timeout-minutes : 25
839- run : |
840- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
841-
842- - name : Benchmark offline PP decode throughput (PP=2)
843- timeout-minutes : 10
844- run : |
845- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
846-
847- - name : Benchmark offline PP prefill throughput (PP=2)
848- timeout-minutes : 10
849- run : |
850- bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
757+ bash scripts/ci/amd_ci_exec.sh -w "/sglang-checkout/test" python3 run_suite.py --hw amd --suite stage-b-test-large-2-gpu-performance-amd --auto-partition-id ${{ matrix.part }} --auto-partition-size 2 --timeout-per-file 1800
758+
759+ # performance-test-1-gpu-part-1-amd:
760+ # needs: [check-changes, stage-a-test-1-amd]
761+ # if: |
762+ # always() &&
763+ # (
764+ # (inputs.target_stage == 'performance-test-1-gpu-part-1-amd') ||
765+ # (
766+ # !inputs.target_stage &&
767+ # (!failure() && !cancelled()) &&
768+ # ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
769+ # )
770+ # )
771+ # strategy:
772+ # fail-fast: false
773+ # matrix:
774+ # runner: [linux-mi325-gpu-1]
775+ # runs-on: ${{matrix.runner}}
776+ # steps:
777+ # - name: Checkout code
778+ # uses: actions/checkout@v4
779+ # with:
780+ # ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
781+
782+ # - name: Ensure VRAM is clear
783+ # run: bash scripts/ensure_vram_clear.sh rocm
784+
785+ # - name: Start CI container
786+ # run: bash scripts/ci/amd_ci_start_container.sh
787+ # env:
788+ # GITHUB_WORKSPACE: ${{ github.workspace }}
789+
790+ # - name: Install dependencies
791+ # run: bash scripts/ci/amd_ci_install_dependency.sh
792+
793+ # - name: Benchmark single latency
794+ # timeout-minutes: 20
795+ # run: |
796+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_small
797+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_1gpu.TestBenchOneBatch1GPU.test_bs1_default
798+
799+ # - name: Benchmark online latency
800+ # timeout-minutes: 15
801+ # run: |
802+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_online_latency_default
803+
804+ # - name: Benchmark offline throughput
805+ # timeout-minutes: 15
806+ # run: |
807+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_default
808+
809+ # - name: Benchmark offline throughput (Non-streaming, small batch size)
810+ # timeout-minutes: 15
811+ # run: |
812+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_non_stream_small_batch_size
813+
814+ # performance-test-1-gpu-part-2-amd:
815+ # needs: [check-changes, stage-a-test-1-amd]
816+ # if: |
817+ # always() &&
818+ # (
819+ # (inputs.target_stage == 'performance-test-1-gpu-part-2-amd') ||
820+ # (
821+ # !inputs.target_stage &&
822+ # (!failure() && !cancelled()) &&
823+ # ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
824+ # )
825+ # )
826+ # strategy:
827+ # fail-fast: false
828+ # matrix:
829+ # runner: [linux-mi325-gpu-1]
830+ # runs-on: ${{matrix.runner}}
831+ # steps:
832+ # - name: Checkout code
833+ # uses: actions/checkout@v4
834+ # with:
835+ # ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
836+
837+ # - name: Ensure VRAM is clear
838+ # run: bash scripts/ensure_vram_clear.sh rocm
839+
840+ # - name: Start CI container
841+ # run: bash scripts/ci/amd_ci_start_container.sh
842+ # env:
843+ # GITHUB_WORKSPACE: ${{ github.workspace }}
844+
845+ # - name: Install dependencies
846+ # run: bash scripts/ci/amd_ci_install_dependency.sh
847+
848+ # - name: Benchmark offline throughput (w/o RadixAttention)
849+ # timeout-minutes: 15
850+ # run: |
851+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_without_radix_cache
852+
853+ # - name: Benchmark offline throughput (w/ Triton)
854+ # timeout-minutes: 15
855+ # run: |
856+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_part1.TestBenchServing1GPUPart1.test_offline_throughput_with_triton_attention_backend
857+
858+ # - name: Benchmark offline throughput (w/ FP8)
859+ # timeout-minutes: 15
860+ # run: |
861+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_1gpu_large.TestBenchServing1GPULarge.test_offline_throughput_default_fp8
862+
863+ # performance-test-2-gpu-amd:
864+ # needs: [check-changes, stage-a-test-1-amd]
865+ # if: |
866+ # always() &&
867+ # (
868+ # (inputs.target_stage == 'performance-test-2-gpu-amd') ||
869+ # (
870+ # !inputs.target_stage &&
871+ # (!failure() && !cancelled()) &&
872+ # ((needs.check-changes.outputs.main_package == 'true') || (needs.check-changes.outputs.sgl_kernel == 'true'))
873+ # )
874+ # )
875+ # strategy:
876+ # fail-fast: false
877+ # matrix:
878+ # runner: [linux-mi325-gpu-2]
879+ # runs-on: ${{matrix.runner}}
880+ # steps:
881+ # - name: Checkout code
882+ # uses: actions/checkout@v4
883+ # with:
884+ # ref: ${{ inputs.pr_head_sha || inputs.ref || github.sha }}
885+
886+ # - name: Ensure VRAM is clear
887+ # run: bash scripts/ensure_vram_clear.sh rocm
888+
889+ # - name: Start CI container
890+ # run: bash scripts/ci/amd_ci_start_container.sh
891+ # env:
892+ # GITHUB_WORKSPACE: ${{ github.workspace }}
893+
894+ # - name: Install dependencies
895+ # run: bash scripts/ci/amd_ci_install_dependency.sh
896+
897+ # - name: Benchmark dummy grok (TP=2)
898+ # timeout-minutes: 30
899+ # run: |
900+ # bash scripts/ci/amd_ci_exec.sh python3 models/test_dummy_grok_models.py
901+
902+ # - name: Benchmark single latency (TP=2)
903+ # timeout-minutes: 25
904+ # run: |
905+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_moe_tp2_bs1
906+
907+ # - name: Benchmark single latency + torch.compile (TP=2)
908+ # timeout-minutes: 25
909+ # run: |
910+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_one_batch_2gpu.TestBenchOneBatch2GPU.test_torch_compile_tp2_bs1
911+
912+ # - name: Benchmark offline throughput (TP=2)
913+ # timeout-minutes: 25
914+ # run: |
915+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_default
916+
917+ # - name: Benchmark offline throughput (w/o RadixAttention) (TP=2)
918+ # timeout-minutes: 25
919+ # run: |
920+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_moe_offline_throughput_without_radix_cache
921+
922+ # - name: Benchmark offline PP decode throughput (PP=2)
923+ # timeout-minutes: 10
924+ # run: |
925+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_offline_throughput_default_decode
926+
927+ # - name: Benchmark offline PP prefill throughput (PP=2)
928+ # timeout-minutes: 10
929+ # run: |
930+ # bash scripts/ci/amd_ci_exec.sh -w /sglang-checkout/test/registered/perf python3 -m unittest test_bench_serving_2gpu.TestBenchServing2GPU.test_pp_long_context_prefill
851931
852932 accuracy-test-1-gpu-amd :
853933 needs : [check-changes, stage-a-test-1-amd]
0 commit comments