1StepForever
diff --git a/‎.github/CI_PERMISSIONS.json‎
Lines changed: 21 additions & 0 deletions b/‎.github/CI_PERMISSIONS.json‎
Lines changed: 21 additions & 0 deletions
diff --git a/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 2 deletions b/‎.github/CODEOWNERS‎
Lines changed: 3 additions & 2 deletions
diff --git a/‎.github/labeler.yml‎
Lines changed: 7 additions & 0 deletions b/‎.github/labeler.yml‎
Lines changed: 7 additions & 0 deletions
diff --git a/‎.github/workflows/execute-notebook.yml‎
Lines changed: 15 additions & 1 deletion b/‎.github/workflows/execute-notebook.yml‎
Lines changed: 15 additions & 1 deletion
diff --git a/‎.github/workflows/nightly-test-amd.yml‎
Lines changed: 40 additions & 5 deletions b/‎.github/workflows/nightly-test-amd.yml‎
Lines changed: 40 additions & 5 deletions
diff --git a/‎.github/workflows/nightly-test-npu.yml‎
Lines changed: 120 additions & 1 deletion b/‎.github/workflows/nightly-test-npu.yml‎
Lines changed: 120 additions & 1 deletion
diff --git a/‎.github/workflows/nightly-test-nvidia.yml‎
Lines changed: 4 additions & 4 deletions b/‎.github/workflows/nightly-test-nvidia.yml‎
Lines changed: 4 additions & 4 deletions
@@ -209,6 +209,13 @@
         "reason": "top contributor",
         "can_rerun_stage": true
     },
+    "Shunkangz": {
+        "can_tag_run_ci_label": true,
+        "can_rerun_failed_ci": true,
+        "cooldown_interval_minutes": 60,
+        "reason": "custom override",
+        "can_rerun_stage": true
+    },
     "SimonCqk": {
         "can_tag_run_ci_label": true,
         "can_rerun_failed_ci": true,
@@ -412,6 +419,13 @@
         "reason": "custom override",
         "can_rerun_stage": true
     },
+    "dongjiyingdjy": {
+        "can_tag_run_ci_label": true,
+        "can_rerun_failed_ci": true,
+        "cooldown_interval_minutes": 60,
+        "reason": "custom override",
+        "can_rerun_stage": true
+    },
     "dougyster": {
         "can_tag_run_ci_label": true,
         "can_rerun_failed_ci": true,
@@ -811,6 +825,13 @@
         "reason": "top contributor",
         "can_rerun_stage": true
     },
+    "samuellees": {
+        "can_tag_run_ci_label": true,
+        "can_rerun_failed_ci": true,
+        "cooldown_interval_minutes": 60,
+        "reason": "custom override",
+        "can_rerun_stage": true
+    },
     "scottjlee": {
         "can_tag_run_ci_label": true,
         "can_rerun_failed_ci": true,
 
@@ -3,9 +3,10 @@
 /docker/npu.Dockerfile @ping1jing2 @iforgetmyname
 /python/pyproject.toml @merrymercy @Fridge003 @ispobock
 /python/sglang/jit_kernel @DarkSharpness @BBuf
+/python/sglang/jit_kernel/diffusion @yingluosanqian @BBuf @mickqian
 /python/sglang/multimodal_gen @mickqian @yhyang201
-/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf
-/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf
+/python/sglang/multimodal_gen/runtime/layers @mickqian @yhyang201 @BBuf @yingluosanqian
+/python/sglang/multimodal_gen/runtime/models/dits @mickqian @yhyang201 @BBuf @yingluosanqian
 /python/sglang/srt/batch_invariant_ops @Fridge003 @hebiao064
 /python/sglang/srt/constrained @hnyls2002 @DarkSharpness
 /python/sglang/srt/compilation @hebiao064
 
@@ -108,3 +108,10 @@ deterministic:
 piecewise-cuda-graph:
   - changed-files:
     - any-glob-to-any-file: 'python/sglang/srt/compilation/**/*'
+
+# Moore Threads specific
+mthreads:  # label applied when any changed file matches any of the globs below
+  - changed-files:
+    - any-glob-to-any-file:
+      - '**/*mthreads*'
+      - '**/*musa*'  # NOTE(review): presumably Moore Threads' MUSA stack - confirm
@@ -3,21 +3,33 @@ name: Execute Notebooks
 on:
   pull_request:
     branches: [ main ]
+    types: [opened, synchronize, reopened, labeled]
     paths:
       - "python/sglang/**"
       - "docs/**"
+      - "!python/sglang/**/*.md"
+      - "!docs/**/*.md"
   workflow_dispatch:
 
 
 concurrency:
   group: execute-notebook-${{ github.ref }}
   cancel-in-progress: true
 
+env:
+  SGLANG_IS_IN_CI: true
 
 jobs:
+  call-gate:
+    # Align with PR Test: fail fast if PR doesn't have run-ci label.
+    # This makes /tag-and-rerun-ci work by rerunning this failed workflow.
+    uses: ./.github/workflows/pr-gate.yml  # reusable workflow from this repository
+    secrets: inherit  # pass the calling workflow's secrets through to the gate
+
   run-all-notebooks:
+    needs: [call-gate]
     runs-on: 1-gpu-runner
-    if: github.event_name != 'pull_request' || contains(github.event.pull_request.labels.*.name, 'run-ci')
+    if: github.event_name != 'pull_request' || needs.call-gate.result == 'success'
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -43,9 +55,11 @@ jobs:
 
   notebook-finish:
     needs: [
+      call-gate,
       run-all-notebooks
     ]
     runs-on: ubuntu-latest
+    if: always() && needs.run-all-notebooks.result != 'skipped'
     steps:
       - name: Check all dependent job statuses
         run: |
 
@@ -34,6 +34,7 @@ on:
           - 'nightly-8-gpu-kimi-k2'
           # MI35x jobs
           - 'nightly-test-1-gpu-mi35x'
+          - 'nightly-8-gpu-mi35x-kimi-k2'
           - 'nightly-accuracy-8-gpu-mi35x'
           - 'nightly-8-gpu-mi35x-grok1-int4'
           - 'nightly-8-gpu-mi35x-grok2'
@@ -582,13 +583,13 @@ jobs:
           bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
 
       - name: Accuracy Test MI35x (8-GPU Grok1-INT4)
-        timeout-minutes: 60
+        timeout-minutes: 90
         run: |
           > github_summary.md  # Clear summary file
           bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
             -e RCCL_MSCCL_ENABLE=0 \
             -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
-            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 3600 || TEST_EXIT_CODE=$?
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-grok1-int4 --nightly --timeout-per-file 5400 || TEST_EXIT_CODE=$?
           echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
@@ -793,6 +794,39 @@ jobs:
           echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
           exit ${TEST_EXIT_CODE:-0}
 
+  # MI35x 8-GPU Kimi-K2 (Accuracy): runs in sgl-project/sglang or on pull_request events, and only when job_filter is empty, 'all', or this job's name
+  nightly-8-gpu-mi35x-kimi-k2:
+    if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-8-gpu-mi35x-kimi-k2')
+    runs-on: linux-mi35x-gpu-8
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # an explicit input ref wins; otherwise the triggering ref
+
+      - name: Setup docker
+        run: |
+          touch github_summary.md
+          bash scripts/ci/amd/amd_ci_start_container.sh
+        env:
+          GITHUB_WORKSPACE: ${{ github.workspace }}  # NOTE(review): presumably read by the container start script - confirm
+
+      - name: Install dependencies
+        run: |
+          bash scripts/ci/amd/amd_ci_install_dependency.sh
+          # Install tabulate for run_suite.py (missing in MI35x container)
+          bash scripts/ci/amd/amd_ci_exec.sh pip install tabulate
+
+      - name: Accuracy Test MI35x (8-GPU Kimi-K2)
+        timeout-minutes: 180  # step-level cap; run_suite.py below is given 7200 s (120 min) per test file
+        run: |
+          > github_summary.md  # Clear summary file
+          bash scripts/ci/amd/amd_ci_exec.sh -w /sglang-checkout/test \
+            -e GITHUB_STEP_SUMMARY="/sglang-checkout/github_summary.md" \
+            python3 run_suite.py --hw amd --suite nightly-amd-accuracy-8-gpu-mi35x-kimi-k2 --nightly --timeout-per-file 7200 || TEST_EXIT_CODE=$?
+          echo "$(<github_summary.md )" >> $GITHUB_STEP_SUMMARY || true
+          exit ${TEST_EXIT_CODE:-0}
+
   # MI35x 8-GPU DeepSeek-V3.2 Performance Test (MTP)
   nightly-perf-8-gpu-mi35x-deepseek-v32-mtp:
     if: (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') && (inputs.job_filter == '' || inputs.job_filter == 'all' || inputs.job_filter == 'nightly-perf-8-gpu-mi35x-deepseek-v32-mtp')
@@ -834,10 +868,10 @@ jobs:
       # MI30x Accuracy Tests
       - nightly-accuracy-2-gpu
       - nightly-accuracy-2-gpu-vlm
-      # MI30x Performance Tests
-      - nightly-perf-2-gpu-text
-      - nightly-perf-2-gpu-vlm
       - nightly-accuracy-8-gpu
+      # MI30x Performance Tests - excluded from check (perf failures don't block CI)
+      # - nightly-perf-2-gpu-text
+      # - nightly-perf-2-gpu-vlm
       # MI30x Combined Accuracy + Performance Tests
       - nightly-8-gpu-grok1-int4
       - nightly-8-gpu-grok2
@@ -853,6 +887,7 @@ jobs:
       - nightly-8-gpu-mi35x-deepseek-r1-mxfp4
       - nightly-accuracy-8-gpu-mi35x-deepseek-v32
       - nightly-accuracy-8-gpu-mi35x-deepseek-v32-mtp
+      - nightly-8-gpu-mi35x-kimi-k2
       # MI35x perf jobs excluded from check - perf failures don't block CI
       # - nightly-perf-8-gpu-mi35x-deepseek-v32-basic
       # - nightly-perf-8-gpu-mi35x-deepseek-v32-mtp
 
@@ -165,7 +165,7 @@ jobs:
           STREAMS_PER_DEVICE: 32
         run: |
           hf download lmms-lab/MMMU --repo-type dataset
-          pip install sentence_transformers torchaudio==2.8.0 torch_npu==2.8.0
+          pip install sentence_transformers
           pip install protobuf==6.31.1 zss pre-commit wandb>=0.16.0 tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap
           pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 sacrebleu>=1.5.0 pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 peft>=0.2.0 accelerate>=0.29.1
           pip install jsonlines httpx==0.25.0 evaluate>=0.4.0 datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
@@ -178,11 +178,130 @@ jobs:
           cd test
           python3 run_suite.py --hw npu --suite nightly-4-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
 
+  nightly-8-npu-a3:  # nightly NPU suite "nightly-8-npu-a3" on the linux-aarch64-a3-8 runner
+    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    runs-on: linux-aarch64-a3-8
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0]  # single partition; matches --auto-partition-size 1 in the Run test step
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # an explicit input ref wins; otherwise the triggering ref
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
+
+          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Print Log Information
+        run: |
+          bash scripts/ci/npu/npu_log_print.sh
+
+      - name: Run test
+        timeout-minutes: 240
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
+        run: |
+          hf download lmms-lab/MMMU --repo-type dataset
+          pip install sentence_transformers
+          pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap  # specifiers containing '>' must be quoted, otherwise the shell treats them as output redirections
+          pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"  # peft is listed once; the ==0.2.0 pin already satisfies >=0.2.0
+          pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
+          git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+          cd ./lmms-eval
+          nohup pip install . > lmmslog.txt 2>&1 &
+          sleep 120  # NOTE(review): fixed wait for the background install above; its completion and exit status are never checked
+          export PYTHONPATH=$PYTHONPATH:$(pwd)
+          cd ../
+          cd test
+          python3 run_suite.py --hw npu --suite nightly-8-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+
+  nightly-16-npu-a3:  # nightly NPU suite "nightly-16-npu-a3" on the linux-aarch64-a3-16 runner
+    if: ${{ (github.repository == 'sgl-project/sglang' || github.event_name == 'pull_request') }}
+    runs-on: linux-aarch64-a3-16
+    strategy:
+      fail-fast: false
+      matrix:
+        part: [0]  # single partition; matches --auto-partition-size 1 in the Run test step
+    container:
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.5.0-a3-ubuntu22.04-py3.11
+    steps:
+      - name: Checkout code
+        uses: actions/checkout@v4
+        with:
+          ref: ${{ inputs.ref || github.ref }}  # an explicit input ref wins; otherwise the triggering ref
+
+      - name: Install dependencies
+        run: |
+          # speed up by using infra cache services
+          CACHING_URL="cache-service.nginx-pypi-cache.svc.cluster.local"
+          sed -Ei "s@(ports|archive).ubuntu.com@${CACHING_URL}:8081@g" /etc/apt/sources.list
+          pip config set global.index-url http://${CACHING_URL}/pypi/simple
+          pip config set global.extra-index-url "https://pypi.tuna.tsinghua.edu.cn/simple"
+          pip config set global.trusted-host "${CACHING_URL} pypi.tuna.tsinghua.edu.cn"
+
+          bash scripts/ci/npu/npu_ci_install_dependency.sh a3
+          # copy required file from our daily cache
+          cp ~/.cache/modelscope/hub/datasets/otavia/ShareGPT_Vicuna_unfiltered/ShareGPT_V3_unfiltered_cleaned_split.json /tmp
+          # copy download through proxy
+          curl -o /tmp/test.jsonl -L https://gh-proxy.test.osinfra.cn/https://raw.githubusercontent.com/openai/grade-school-math/master/grade_school_math/data/test.jsonl
+
+      - name: Print Log Information
+        run: |
+          bash scripts/ci/npu/npu_log_print.sh
+
+      - name: Run test
+        timeout-minutes: 240
+        env:
+          SGLANG_USE_MODELSCOPE: true
+          SGLANG_IS_IN_CI: true
+          HF_ENDPOINT: https://hf-mirror.com
+          TORCH_EXTENSIONS_DIR: /tmp/torch_extensions
+          PYTORCH_NPU_ALLOC_CONF: "expandable_segments:True"
+          STREAMS_PER_DEVICE: 32
+        run: |
+          hf download lmms-lab/MMMU --repo-type dataset
+          pip install sentence_transformers
+          pip install protobuf==6.31.1 zss pre-commit "wandb>=0.16.0" tenacity==8.3.0 loguru openpyxl latex2sympy2 zstandard transformers-stream-generator tqdm-multiprocess pycocoevalcap  # specifiers containing '>' must be quoted, otherwise the shell treats them as output redirections
+          pip install yt-dlp sentencepiece==0.1.99 nltk av ftfy sqlitedict==2.1.0 "sacrebleu>=1.5.0" pytablewriter peft==0.2.0 black==24.1.0 isort==5.13.2 "accelerate>=0.29.1"  # peft is listed once; the ==0.2.0 pin already satisfies >=0.2.0
+          pip install jsonlines httpx==0.25.0 "evaluate>=0.4.0" datasets==2.16.1 numexpr xgrammar==0.1.25 numpy==1.26.4 dotenv
+          git clone --branch v0.3.3 --depth 1 https://github.com/EvolvingLMMs-Lab/lmms-eval.git
+          cd ./lmms-eval
+          nohup pip install . > lmmslog.txt 2>&1 &
+          sleep 120  # NOTE(review): fixed wait for the background install above; its completion and exit status are never checked
+          export PYTHONPATH=$PYTHONPATH:$(pwd)
+          cd ../
+          cd test
+          python3 run_suite.py --hw npu --suite nightly-16-npu-a3 --nightly --continue-on-error --timeout-per-file 3600 --auto-partition-id ${{ matrix.part }} --auto-partition-size 1
+
   check-all-jobs:
     if: github.repository == 'sgl-project/sglang' && always()
     needs:
       - nightly-1-npu-a3
+      - nightly-2-npu-a3
       - nightly-4-npu-a3
+      - nightly-8-npu-a3
+      - nightly-16-npu-a3
     runs-on: ubuntu-latest
     container:
       image: docker.m.daocloud.io/ubuntu:22.04
 
@@ -95,7 +95,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        partition: [0, 1, 2]
+        partition: [0, 1, 2, 3]
     env:
       RUNNER_LABELS: 8-gpu-h200
     steps:
@@ -118,7 +118,7 @@ jobs:
           IS_H200: "1"
         run: |
           cd test
-          python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=3
+          python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=18000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
 
       - name: Run test
         timeout-minutes: 30
@@ -179,7 +179,7 @@ jobs:
     strategy:
       fail-fast: false
       matrix:
-        partition: [0, 1, 2]
+        partition: [0, 1, 2, 3]
     steps:
       - name: Checkout code
         uses: actions/checkout@v4
@@ -199,7 +199,7 @@ jobs:
           GPU_CONFIG: "8-gpu-b200"
         run: |
           cd test
-          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=3
+          IS_BLACKWELL=1 python3 run_suite.py --hw cuda --suite nightly-8-gpu-common --nightly --timeout-per-file=12000 --continue-on-error --auto-partition-id=${{ matrix.partition }} --auto-partition-size=4
 
       - name: Collect performance metrics
         if: always()