Skip to content

Commit 05319dc

Browse files
author
wangweiwei
committed
Merge remote-tracking branch 'www/pr/ks' into pr/ks
* www/pr/ks: (265 commits) [BugFix][PD]Fix metadata_buffer_index leak when aborted in PD (sgl-project#17483) Refactoring Mooncake TE as a shared distributed component (sgl-project#17810) [ModelOPT] Support Qwen 3 Next Coder NVFP4 (sgl-project#18224) Update author information in pyproject.toml (sgl-project#18453) [Kimi-K2.5] Fix missing `quant_config` in `KimiK25` (sgl-project#18440) Add tensor parallelism support to LFM2 ShortConv layers (sgl-project#17777) [diffusion] chore: revise process title (sgl-project#18446) Fix TRT-LLM MLA backend applying k_scale to BF16 KV cache in BMM1 (sgl-project#18396) [diffusion] refactor: group component loaders under the component_loaders/ directory (sgl-project#18438) [ModelOpt] Fix broken Qwen3-235B-A22B-Instruct-2507-NVFP4 launch (sgl-project#18189) [diffusion] feat: support efficient sequence shard (sgl-project#18161) [CI] fix: notebook ci may not working (sgl-project#18417) fix: sync server_args.kv_cache_dtype when detecting FP8 KV cache (sgl-project#18394) [Fix] Fix backend selection after flashinfer version update (sgl-project#18364) [diffusion] platform: support WAN/FLUX/Qwen-Image/Qwen-Image-edit on Ascend (sgl-project#13662) fix: fix NVFP4 Kimi-K2.5 weight mapping and exclude list (sgl-project#18370) [diffusion] feat: support saving videos directly on the server to avoid the overhead of tensor transfer (sgl-project#18253) [diffusion] fix: respect dist_timeout option (sgl-project#18386) [Doc] Fix outdated `--fp4-gemm-backend` documentation (sgl-project#18350) [diffusion] fix: remove unnecessary norm_type argument from GLM-Image dits (sgl-project#18382) ...
2 parents 9a14254 + b4a780d commit 05319dc

File tree

671 files changed

+43948
-16540
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

671 files changed

+43948
-16540
lines changed

.github/CI_PERMISSIONS.json

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -209,6 +209,13 @@
209209
"reason": "top contributor",
210210
"can_rerun_stage": true
211211
},
212+
"Shunkangz": {
213+
"can_tag_run_ci_label": true,
214+
"can_rerun_failed_ci": true,
215+
"cooldown_interval_minutes": 60,
216+
"reason": "custom override",
217+
"can_rerun_stage": true
218+
},
212219
"SimonCqk": {
213220
"can_tag_run_ci_label": true,
214221
"can_rerun_failed_ci": true,
@@ -412,6 +419,13 @@
412419
"reason": "custom override",
413420
"can_rerun_stage": true
414421
},
422+
"dongjiyingdjy": {
423+
"can_tag_run_ci_label": true,
424+
"can_rerun_failed_ci": true,
425+
"cooldown_interval_minutes": 60,
426+
"reason": "custom override",
427+
"can_rerun_stage": true
428+
},
415429
"dougyster": {
416430
"can_tag_run_ci_label": true,
417431
"can_rerun_failed_ci": true,
@@ -811,6 +825,13 @@
811825
"reason": "top contributor",
812826
"can_rerun_stage": true
813827
},
828+
"samuellees": {
829+
"can_tag_run_ci_label": true,
830+
"can_rerun_failed_ci": true,
831+
"cooldown_interval_minutes": 60,
832+
"reason": "custom override",
833+
"can_rerun_stage": true
834+
},
814835
"scottjlee": {
815836
"can_tag_run_ci_label": true,
816837
"can_rerun_failed_ci": true,

.github/CODEOWNERS

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -3,9 +3,10 @@
33
/docker/npu.Dockerfile @ping1jing2 @iforgetmyname
44
/python/pyproject.toml @merrymercy @Fridge003 @ispobock
55
/python/sglang/jit_kernel @DarkSharpness @BBuf
6+
/python/sglang/jit_kernel/diffusion @yingluosanqian @BBuf @mickqian
67
/python/sglang/multimodal_gen @mickqian @yhyang201
7-
/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf
8-
/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf
8+
/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf @yingluosanqian
9+
/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf @yingluosanqian
910
/python/sglang/srt/batch_invariant_ops @Fridge003 @hebiao064
1011
/python/sglang/srt/constrained @hnyls2002 @DarkSharpness
1112
/python/sglang/srt/compilation @hebiao064

.github/labeler.yml

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -108,3 +108,10 @@ deterministic:
108108
piecewise-cuda-graph:
109109
- changed-files:
110110
- any-glob-to-any-file: 'python/sglang/srt/compilation/**/*'
111+
112+
# Moore Threads specific
113+
mthreads:
114+
- changed-files:
115+
- any-glob-to-any-file:
116+
- '**/*mthreads*'
117+
- '**/*musa*'

.github/workflows/execute-notebook.yml

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -3,21 +3,33 @@ name: Execute Notebooks
33
on:
44
pull_request:
55
branches: [ main ]
6+
types: [opened, synchronize, reopened, labeled]
67
paths:
78
- "python/sglang/**"
89
- "docs/**"
10+
- "!python/sglang/**/*.md"
11+
- "!docs/**/*.md"
912
workflow_dispatch:
1013

1114

1215
concurrency:
1316
group: execute-notebook-${{ github.ref }}
1417
cancel-in-progress: true
1518

19+
env:
20+
SGLANG_IS_IN_CI: true
1621

1722
jobs:
23+
call-gate:
24+
# Align with PR Test: fail fast if PR doesn't have run-ci label.
25+
# This makes /tag-and-rerun-ci work by rerunning this failed workflow.
26+
uses: ./.github/workflows/pr-gate.yml
27+
secrets: inherit
28+
1829
run-all-notebooks:
30+
needs: [call-gate]
1931
runs-on: 1-gpu-runner
20-
if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
32+
if: github.event_name != 'pull_request' || needs.call-gate.result == 'success'
2133
steps:
2234
- name: Checkout code
2335
uses: actions/checkout@v4
@@ -43,9 +55,11 @@ jobs:
4355
4456
notebook-finish:
4557
needs: [
58+
call-gate,
4659
run-all-notebooks
4760
]
4861
runs-on: ubuntu-latest
62+
if: always() && needs.run-all-notebooks.result != 'skipped'
4963
steps:
5064
- name: Check all dependent job statuses
5165
run: |

.github/workflows/nightly-test-amd.yml

Lines changed: 40 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,7 @@ on:
3434
- 'nightly-8-gpu-kimi-k2'
3535
# MI35x jobs
3636
- 'nightly-test-1-gpu-mi35x'
37+
- 'nightly-8-gpu-mi35x-kimi-k2'
3738
- 'nightly-accuracy-8-gpu-mi35x'
3839
- 'nightly-8-gpu-mi35x-grok1-int4'
3940
- 'nightly-8-gpu-mi35x-grok2'
@@ -582,13 +583,13 @@ jobs:
582583
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
583584
584585
- name: Accuracy Test MI35x (8-GPU Grok1-INT4)
585-
timeout-minutes: 60
586+
timeout-minutes: 90
586587
run: |
587588
> github_summary.md # Clear summary file
588589
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
589590
-e RCCL_MSCCL_ENABLE=0 \
590591
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
591-
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
592+
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
592593
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
593594
exit ${TEST_EXIT_CODE:-0}
594595
@@ -793,6 +794,39 @@ jobs:
793794
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
794795
exit ${TEST_EXIT_CODE:-0}
795796
797+
# MI35x 8-GPU Kimi-K2 (Accuracy)
798+
nightly-8-gpu-mi35x-kimi-k2:
799+
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k2')
800+
runs-on: linux-mi35x-gpu-8
801+
steps:
802+
- name: Checkout code
803+
uses: actions/checkout@v4
804+
with:
805+
ref: ${{ inputs.ref || github.ref }}
806+
807+
- name: Setup docker
808+
run: |
809+
touch github_summary.md
810+
bash scripts/ci/amd/amd_ci_start_container.sh
811+
env:
812+
GITHUB_WORKSPACE: ${{ github.workspace }}
813+
814+
- name: Install dependencies
815+
run: |
816+
bash scripts/ci/amd/amd_ci_install_dependency.sh
817+
# Install tabulate for run_suite.py (missing in MI35x container)
818+
bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
819+
820+
- name: Accuracy Test MI35x (8-GPU Kimi-K2)
821+
timeout-minutes: 180
822+
run: |
823+
> github_summary.md # Clear summary file
824+
bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
825+
-e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
826+
python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
827+
echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
828+
exit ${TEST_EXIT_CODE:-0}
829+
796830
# MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP)
797831
nightly-perf-8-gpu-mi35x-deepseek-v32-mtp:
798832
if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp')
@@ -834,10 +868,10 @@ jobs:
834868
# MI30x Accuracy Tests
835869
- nightly-accuracy-2-gpu
836870
- nightly-accuracy-2-gpu-vlm
837-
# MI30x Performance Tests
838-
- nightly-perf-2-gpu-text
839-
- nightly-perf-2-gpu-vlm
840871
- nightly-accuracy-8-gpu
872+
# MI30x Performance Tests - excluded from check (perf failures don't block CI)
873+
# - nightly-perf-2-gpu-text
874+
# - nightly-perf-2-gpu-vlm
841875
# MI30x Combined Accuracy + Performance Tests
842876
- nightly-8-gpu-grok1-int4
843877
- nightly-8-gpu-grok2
@@ -853,6 +887,7 @@ jobs:
853887
- nightly-8-gpu-mi35x-deepseek-r1-mxfp4
854888
- nightly-accuracy-8-gpu-mi35x-deepseek-v32
855889
- nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp
890+
- nightly-8-gpu-mi35x-kimi-k2
856891
# MI35x perf jobs excluded from check - perf failures don't block CI
857892
# - nightly-perf-8-gpu-mi35x-deepseek-v32-basic
858893
# - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp

.github/workflows/nightly-test-npu.yml

Lines changed: 120 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ jobs:
165165
STREAMS_PER_DEVICE: 32
166166
run: |
167167
hf download lmms-lab/MMMU --repo-type dataset
168-
pip install sentence_transformers torchaudio==2.8.0 torch_npu==2.8.0
168+
pip install sentence_transformers
169169
pip install protobuf==6.31.1 zss pre-commit wandb>=0.16.0 tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
170170
pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 sacrebleu>=1.5.0 pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 peft>=0.2.0 accelerate>=0.29.1
171171
pip install jsonlines httpx==0.25.0 evaluate>=0.4.0 datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
@@ -178,11 +178,130 @@ jobs:
178178
cd test
179179
python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
180180
181+
nightly-8-npu-a3:
182+
if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
183+
runs-on: linux-aarch64-a3-8
184+
strategy:
185+
fail-fast: false
186+
matrix:
187+
part: [0]
188+
container:
189+
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
190+
steps:
191+
- name: Checkout code
192+
uses: actions/checkout@v4
193+
with:
194+
ref: ${{ inputs.ref || github.ref }}
195+
196+
- name: Install dependencies
197+
run: |
198+
# speed up by using infra cache services
199+
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
200+
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
201+
pip config set global.index-url http://${CACHING_URL}/pypi/simple
202+
pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
203+
pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
204+
205+
bash scripts/ci/npu/npu_ci_install_dependency.sh a3
206+
# copy required file from our daily cache
207+
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
208+
# copy download through proxy
209+
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
210+
211+
- name: Print Log Information
212+
run: |
213+
bash scripts/ci/npu/npu_log_print.sh
214+
215+
- name: Run test
216+
timeout-minutes: 240
217+
env:
218+
SGLANG_USE_MODELSCOPE: true
219+
SGLANG_IS_IN_CI: true
220+
HF_ENDPOINT: https://hf-mirror.com
221+
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
222+
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
223+
STREAMS_PER_DEVICE: 32
224+
run: |
225+
hf download lmms-lab/MMMU --repo-type dataset
226+
pip install sentence_transformers
227+
# Requirement specifiers containing '>' must be quoted: unquoted, bash treats
# '>=X' as an stdout redirection to a file named '=X', which drops the version
# constraint and hides pip's output from the job log.
pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
228+
# peft is pinned once (==0.2.0); the former extra 'peft>=0.2.0' was redundant.
pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"
229+
pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
230+
git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
231+
cd ./lmms-eval
232+
# NOTE(review): lmms-eval is installed in the background and the script only
# waits a fixed 120s — confirm that is long enough on this runner.
nohup pip install . > lmmslog.txt 2>&1 &
233+
sleep 120
234+
export PYTHONPATH=$PYTHONPATH:$(pwd)
235+
cd ../
236+
cd test
237+
python3 run_suite.py --hw npu --suite nightly-8-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
238+
239+
nightly-16-npu-a3:
240+
if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
241+
runs-on: linux-aarch64-a3-16
242+
strategy:
243+
fail-fast: false
244+
matrix:
245+
part: [0]
246+
container:
247+
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
248+
steps:
249+
- name: Checkout code
250+
uses: actions/checkout@v4
251+
with:
252+
ref: ${{ inputs.ref || github.ref }}
253+
254+
- name: Install dependencies
255+
run: |
256+
# speed up by using infra cache services
257+
CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
258+
sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
259+
pip config set global.index-url http://${CACHING_URL}/pypi/simple
260+
pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
261+
pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
262+
263+
bash scripts/ci/npu/npu_ci_install_dependency.sh a3
264+
# copy required file from our daily cache
265+
cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
266+
# copy download through proxy
267+
curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
268+
269+
- name: Print Log Information
270+
run: |
271+
bash scripts/ci/npu/npu_log_print.sh
272+
273+
- name: Run test
273+
timeout-minutes: 240
274+
env:
275+
SGLANG_USE_MODELSCOPE: true
276+
SGLANG_IS_IN_CI: true
277+
HF_ENDPOINT: https://hf-mirror.com
278+
TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
279+
PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
280+
STREAMS_PER_DEVICE: 32
281+
run: |
282+
hf download lmms-lab/MMMU --repo-type dataset
283+
pip install sentence_transformers
284+
# Requirement specifiers containing '>' must be quoted: unquoted, bash treats
# '>=X' as an stdout redirection to a file named '=X', which drops the version
# constraint and hides pip's output from the job log.
pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
285+
# peft is pinned once (==0.2.0); the former extra 'peft>=0.2.0' was redundant.
pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"
286+
pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
287+
git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
288+
cd ./lmms-eval
289+
# NOTE(review): lmms-eval is installed in the background and the script only
# waits a fixed 120s — confirm that is long enough on this runner.
nohup pip install . > lmmslog.txt 2>&1 &
290+
sleep 120
291+
export PYTHONPATH=$PYTHONPATH:$(pwd)
292+
cd ../
293+
cd test
294+
python3 run_suite.py --hw npu --suite nightly-16-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
296+
181297
check-all-jobs:
182298
if: github.repository == 'sgl-project/sglang' && always()
183299
needs:
184300
- nightly-1-npu-a3
301+
- nightly-2-npu-a3
185302
- nightly-4-npu-a3
303+
- nightly-8-npu-a3
304+
- nightly-16-npu-a3
186305
runs-on: ubuntu-latest
187306
container:
188307
image: docker.m.daocloud.io/ubuntu:22.04

.github/workflows/nightly-test-nvidia.yml

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -95,7 +95,7 @@ jobs:
9595
strategy:
9696
fail-fast: false
9797
matrix:
98-
partition: [0, 1, 2]
98+
partition: [0, 1, 2, 3]
9999
env:
100100
RUNNER_LABELS: 8-gpu-h200
101101
steps:
@@ -118,7 +118,7 @@ jobs:
118118
IS_H200: "1"
119119
run: |
120120
cd test
121-
python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=3
121+
python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
122122
123123
- name: Run test
124124
timeout-minutes: 30
@@ -179,7 +179,7 @@ jobs:
179179
strategy:
180180
fail-fast: false
181181
matrix:
182-
partition: [0, 1, 2]
182+
partition: [0, 1, 2, 3]
183183
steps:
184184
- name: Checkout code
185185
uses: actions/checkout@v4
@@ -199,7 +199,7 @@ jobs:
199199
GPU_CONFIG: "8-gpu-b200"
200200
run: |
201201
cd test
202-
IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=3
202+
IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
203203
204204
- name: Collect performance metrics
205205
if: always()

0 commit comments

Comments
 (0)